# Imports
import unicodedata
import urllib.request
from collections import Counter
from random import sample
from lxml import etree
from pyuca import Collator

This post was originally published on GitHub under the perseus-experiments repo in September 2019. It has been edited and updated for republication here: main changes include improved handling of Greek unicode precombined characters.
What does it take to compile a list of Homeric hapax legomena—words that appear exactly once across the Iliad and the Odyssey? With a fully lemmatized treebank in hand, it becomes a more-or-less straightforward counting exercise in Python.
This notebook pulls the Ancient Greek and Latin Dependency Treebank (AGLDT v2.1) of Homer from PerseusDL/treebank_data, iterates over every annotated <word>, collects the lemma attribute, and counts. The hapaxes fall out of collections.Counter for free; the only nontrivial step is sorting them—sorted() on Greek strings will not give you the order you want, so we hand off to James Tauber’s pyuca for proper Unicode collation.
# pyuca's Collator implements UCA-correct sort keys for Greek
c = Collator()

# AGLDT URIs for Homer's Iliad (tlg0012.tlg001) and Odyssey (tlg0012.tlg002)
treebank_base = (
    'https://raw.githubusercontent.com/PerseusDL/treebank_data'
    '/master/v2.1/Greek/texts/'
)
works = ['tlg0012.tlg001', 'tlg0012.tlg002']
uris = [f'{treebank_base}{w}.perseus-grc1.tb.xml' for w in works]


def get_words(uri):
    """Download one AGLDT treebank XML file and return its <word> elements."""
    with urllib.request.urlopen(uri) as response:
        tree = etree.parse(response)
        return tree.getroot().xpath('.//word')
# Gather every annotated <word> element from both poems into one list.
words = [w for uri in uris for w in get_words(uri)]
print(f"There are {len(words)} 'words' in the AGLDT version of Homer's *Iliad* and *Odyssey*.")

There are 236091 'words' in the AGLDT version of Homer's *Iliad* and *Odyssey*.
Cleaning legacy encoding artifacts in the AGLDT lemmas
Most AGLDT v2.1 lemmas come through as clean precomposed Unicode, but a handful retain pre-Unicode noise that splits real lemmas across multiple keys and surfaces as false hapaxes. The patterns we find here are: a ~ standing in for a circumflex (κει~νος for κεῖνος); a leading standalone breathing mark for a lowercase common word (ʽὡς for ὡς); a leading combining breathing on a lowercased proper noun whose canonical form is capitalized (̓ἀχιλλεύς rather than Ἀχιλλεύς); and two stray noisy entries.
The function below normalizes each of these — dropping the placeholders, stripping the breathing-prefix variants, converting ~ to a combining circumflex, NFC-normalizing the result, and re-capitalizing proper nouns whose initial breathing was the combining variant. Without this step παῖς (which occurs 296 times in the corpus) reports a separate παι~ς as a hapax, and Ἀχιλλεύς accumulates a damaged ̓ἀχιλλεύς ghost-hapax of his own.
# Normalize AGLDT lemma encoding artifacts before counting

# Standalone breathing-mark prefixes (separate visible characters)
LEADING_STANDALONE = ('\u02BD', '\u02BC', '\u1FDE', '\u1FDD')
# Combining breathing-mark prefixes (zero-width modifiers); when these
# appear at the start of a lemma it is invariably an AGLDT damaged
# proper-noun encoding (combining breathing + lowercase initial),
# whose canonical form is title-cased.
LEADING_COMBINING = ('\u0313', '\u0314')
# Noisy lemmas to drop entirely
GARBAGE = {'???', '\u0313"\u0313'}


def normalize_lemma(lemma):
    """Clean one AGLDT lemma string.

    Drops garbage entries, strips legacy leading breathing-mark
    artifacts, converts the '~' circumflex placeholder to a combining
    perispomeni, NFC-normalizes, and re-capitalizes proper nouns whose
    stripped prefix was a combining breathing. Returns the cleaned
    lemma, or None when nothing usable remains.
    """
    if not lemma or lemma in GARBAGE:
        return None

    # Strip every leading breathing-mark artifact, remembering whether a
    # combining (zero-width) variant was among them: that pattern marks
    # a damaged proper noun that must be re-capitalized below.
    saw_combining = False
    prefixes = LEADING_STANDALONE + LEADING_COMBINING
    while lemma and lemma.startswith(prefixes):
        saw_combining = saw_combining or lemma.startswith(LEADING_COMBINING)
        lemma = lemma[1:]

    # '~' is a legacy stand-in for the circumflex; U+0342 is the
    # combining Greek perispomeni, which NFC folds into the vowel.
    lemma = unicodedata.normalize('NFC', lemma.replace('~', '\u0342'))

    if saw_combining and lemma and lemma[0].islower():
        lemma = lemma[0].upper() + lemma[1:]
    return lemma or None


# Get forms and lemmas from word elements, normalizing each lemma.
def get_lemma(word):
    """Return the normalized lemma attribute of a <word> element, or None."""
    raw = word.attrib.get('lemma')
    if not raw:
        return None
    return normalize_lemma(raw)
# Extract the surface forms and normalized lemmas for every token.
forms = [w.attrib['form'] for w in words]
lemmas = [get_lemma(w) for w in words]

unique_forms = sorted(set(forms))
unique_lemmas = sorted({lem for lem in lemmas if lem})

print(f'There are {len(unique_forms)} unique forms in the AGLDT Homer.')
print(f'There are {len(unique_lemmas)} unique lemmas in the AGLDT Homer.')

There are 30815 unique forms in the AGLDT Homer.
There are 8794 unique lemmas in the AGLDT Homer.
# A hapax legomenon is a lemma whose corpus-wide count is exactly 1.
lemma_counts = Counter(lemmas)
hapaxes = [lem for lem, n in lemma_counts.most_common() if n == 1]
print(f'There are {len(hapaxes)} hapaxes in the AGLDT Homer.')
print(f'A sample of AGLDT hapaxes includes:\n {sample(hapaxes, 10)}')

There are 2972 hapaxes in the AGLDT Homer.
A sample of AGLDT hapaxes includes:
['Κλύτιος', 'Ἕλλην', 'προλέγω', 'κατασχεθεῖν', 'ὑπεκπρολύω', 'ἄποινος', 'εὐειδής', 'αἴητος', 'ἐπίστροφος', 'ἐννήκοντα']
Sorting Greek strings with sorted() alone will use code-point order, which puts smooth-breathing forms before rough-breathing ones in ways that no Greek reader expects. pyuca.Collator produces sort keys that follow the Unicode Collation Algorithm, giving the alphabetical order you’d actually want.
for hapax in sorted(hapaxes, key=c.sort_key)[:25]:
    print(hapax)

ἀαγής
ἀβακέω
Ἀβαρβαρέη
Ἄβληρος
ἀβλής
ἄβλητος
ἄβρομος
ἀβροτάζω
ἄβροτος
Ἀβυδόθεν
Ἀβυδόθι
Ἄβυδος
Ἀγάθων
ἀγαίομαι
Ἀγαμεμνονίδης
Ἀγαμήδη
ἄγαμος
ἀγανόφρων
Ἀγαπήνωρ
Ἀγασθένης
ἀγάστονος
Ἀγαυή
ἀγγελίης
ἀγεληδόν
ἀγέραστος
# Persist the UCA-sorted hapax list, one lemma per line.
# Encoding is pinned to UTF-8: relying on the platform's default locale
# encoding can raise UnicodeEncodeError (or silently mis-encode) for
# polytonic Greek on non-UTF-8 systems.
with open('data/homeric_hapaxes.txt', 'w', encoding='utf-8') as f:
    for hapax in sorted(hapaxes, key=c.sort_key):
        f.write(f'{hapax}\n')