# Imports
from collections import Counter
from natsort import natsorted
import spacy
from cltkreaders.lat import LatinTesseraeCorpusReader
from latintools import preprocess
from tabulate import tabulate
from IPython.core.display import HTML
from IPython.display import display
from tqdm import tqdm
In the first part of his 1996 monograph on repetition in Latin poetry, Jeffrey Wills (1996) discusses gemination, i.e. “the repetition of a word in the same form in the same clause with no additional expansion.” In this notebook, we will formalize Wills’ definition of gemination into code using LatinCy.
Let’s start by setting up a code notebook with Python imports, etc. We will use CLTK Readers with the CLTK-Tesserae texts as our exploratory background for gemination.
Wills uses the following line from Virgil’s Eclogues to illustrate gemination (V. Ecl. 2.69):
Corydon, Corydon, quae te dementia cepit!
Let’s begin there.
# Set up corpus
T = LatinTesseraeCorpusReader()

# Get Eclogues file: take the first fileid whose name contains 'eclogues'
eclogues = [file for file in T.fileids() if 'eclogues' in file][0]
print(eclogues)
vergil.eclogues.tess
Next we load a LatinCy model to assist with matching Latin wordforms.
# Set up NLP: load the LatinCy pipeline used for every Doc below
nlp = spacy.load('la_core_web_lg')
Simple gemination
Considering Wills’ basic definition from above, we can use the following pseudocode as a starting point…
- Get a line of Virgil
- Create a LatinCy Doc for each line
- Count the norm token attributes for each line
- Check the norm count, i.e. if the count of any norm token attribute is greater than 1, then the line has gemination
Note that Wills specifically defines the scope of gemination as a clause (not a line); we will return to this point in a future notebook where we introduce some clause parsing.
Get a line of Virgil
For the Tesserae texts, CLTK Readers has a data structure called doc_rows that, at least for poetry, gives us a dictionary with the format {citation: line, etc.}. Let’s get the docrows for the Eclogues and print a sample line.
# Get all Eclogue rows: doc_rows() is consumed with next(), so we take its
# first yielded item, which is indexed by citation below
docrows = next(T.doc_rows(eclogues))

# Get a row by its Tesserae citation key
test = docrows['<verg. ecl. 2.69>']
print(test)
Ah, Corydon, Corydon, quae te dementia cepit!
We can already see our gemination—specifically the example Wills uses in his definition—with the repetition of Corydon.
Create a LatinCy Doc
Next we can create a spaCy Doc for each line. The Doc contains all sorts of annotations useful for philological work. We will use the norm
token attribute here to help us match wordforms.
# Create LatinCy Doc for line
doc = nlp(test)
print(type(doc))
<class 'spacy.tokens.doc.Doc'>
# Print norm examples: one table row per token, showing its index,
# its surface text, and its normalized form
print(
    tabulate(
        [[tok.i, tok.text, tok.norm_] for tok in doc],
        headers=['Index', 'Token', 'Norm'],
    )
)
Index Token Norm
------- -------- --------
0 Ah ah
1 , ,
2 Corydon corydon
3 , ,
4 Corydon corydon
5 , ,
6 quae quae
7 te te
8 dementia dementia
9 cepit cepit
10 ! !
Thinking ahead, if we use lines as is from the Tesserae texts, we have to deal with punctuation. Wills is concerned with the repetition of Corydon, not the repetition of the commas! One way we can deal with this is to preprocess the lines to remove punctuation before creating the Docs. We will discuss the philological implications of preprocessing in a future notebook. For now, we are going to import a script called preprocess
that removes punctuation.
# Create LatinCy Doc for preprocessed line and print example
# (lower=False is passed so the original capitalization stays in the Token column)
doc = nlp(preprocess(test, lower=False))
print(tabulate([[token.i, token.text, token.norm_] for token in doc], headers=['Index','Token', 'Norm']))
Index Token Norm
------- -------- --------
0 Ah ah
1 Corydon corydon
2 Corydon corydon
3 quae quae
4 te te
5 dementia dementia
6 cepit cepit
Count the norm
token attributes
We can now count the norm token attributes for each line using a Counter from the collections module.
# Count `norm` attr in Doc tokens
norms = [token.norm_ for token in doc]
norms_counter = Counter(norms)
print(norms_counter)
Counter({'corydon': 2, 'ah': 1, 'quae': 1, 'te': 1, 'dementia': 1, 'cepit': 1})
Check norm
count
We can now check the norm
count for each line. If the count is greater than 1, then the line has gemination.
# Keep only the norms that occur more than once in the line
geminations = [k for k, v in norms_counter.items() if v > 1]
print(f'Number of geminations: {len(geminations)}')
print(f'{geminations}')
Number of geminations: 1
['corydon']
We knew from Wills that this line would have gemination; of course, not all lines do.
# Try a different line
test = docrows['<verg. ecl. 2.70>']
doc = nlp(preprocess(test))
norms = [token.norm_ for token in doc]
norms_counter = Counter(norms)
geminations = [k for k, v in norms_counter.items() if v > 1]

print(doc.text)
print(f'Number of geminations: {len(geminations)}')
print(f'{geminations}')
semiputata tibi frondosa uitis in ulmo est
Number of geminations: 0
[]
Having worked through our pseudocode, we can now put it all together into a function that we can use to check for gemination in any line of Latin poetry.
def get_geminations(Doc):
    """Return the normalized word forms that occur more than once in ``Doc``.

    Args:
        Doc: An iterable of tokens, each exposing a ``norm_`` attribute
            (in this notebook, a spaCy ``Doc``).

    Returns:
        A list of the ``norm_`` values whose count is greater than 1, in
        order of first occurrence. The list is empty when nothing repeats.
    """
    # Counter preserves insertion order, so results follow the line's word order
    norm_counts = Counter(token.norm_ for token in Doc)
    return [norm for norm, count in norm_counts.items() if count > 1]
Using this function, we can loop through the docrows for the Eclogues and check for gemination in each line. In the example below, we break
after the first match as we are only checking at this point that the function works as expected.
# Smoke-test get_geminations: report the first line with a repeated norm, then stop
for k, v in docrows.items():
    doc = nlp(preprocess(v, lower=False))
    geminations = get_geminations(doc)
    if geminations:
        print(f'{k}: {geminations}')
        print(f'{v}')
        print('\n')
        break
<verg. ecl. 1.23>: ['sic']
sic canibus catulos similis, sic matribus haedos
More useful of course would be to collect all of the geminations into a data structure like a dictionary…
# Map citation -> (original line, repeated norms) for every line with a repetition
virgil_geminations = {}

for k, v in tqdm(docrows.items()):
    doc = nlp(preprocess(v))
    geminations = get_geminations(doc)
    if geminations:
        virgil_geminations[k] = (v, geminations)

# len() here counts the lines that contain at least one repeated norm
print(f'There are {len(virgil_geminations)} geminations in Virgil\'s *Eclogues*.')
100%|██████████| 828/828 [00:03<00:00, 207.68it/s]
There are 105 geminations in Virgil's *Eclogues*.
print('Here are the first five examples from our search:\n')
# v is the (line, geminations) tuple; v[0] is the original line text
for k, v in list(virgil_geminations.items())[:5]:
    print(f'{k}: {v[0]}')
Here are the first five examples from our search:
<verg. ecl. 1.23>: sic canibus catulos similis, sic matribus haedos
<verg. ecl. 1.33>: nec spes libertatis erat, nec cura peculi:
<verg. ecl. 1.63>: aut Ararim Parthus bibet, aut Germania Tigrim,
<verg. ecl. 1.75>: Ite meae, felix quondam pecus, ite capellae.
<verg. ecl. 2.20>: quam dives pecoris, nivei quam lactis abundans.
Note V. Ecl. 1.75 as an example of why we use norm instead of text for matching wordforms. Ite is capitalized here only because it is the first word in the sentence, but should be matched against ite regardless of case. Note the following in Python string matching…
# String equality is case-sensitive: the first comparison differs only in
# capitalization, the second is an exact match
print('Ite' == 'ite', 'ite' == 'ite', sep='\n')
False
True
We can make it easier to see gemination in our texts by formatting matched tokens in HTML. We can use the display function from the IPython.display module to render the HTML in the notebook.
def display_gemination(gemination):
    """Build an HTML string for a line with its repeated tokens colored green.

    Args:
        gemination: A two-item sequence. Item 0 is the line text, which is
            passed to ``nlp`` as-is; item 1 is the collection of ``norm_``
            values to highlight.

    Returns:
        A string with one entry per token, each followed by a single space.
        Tokens whose ``norm_`` is in the highlight collection are wrapped in
        a green ``<span>``.
    """
    line = nlp(gemination[0])
    terms = gemination[1]

    pieces = []
    for token in line:
        if token.norm_ in terms:
            pieces.append(f'<span style="color: green;">{token}</span>')
        else:
            pieces.append(f'{token}')
    # Every token, including the last, is followed by one space
    return ''.join(f'{piece} ' for piece in pieces)
print('Here are the first five examples from our search:')
for k, v in list(virgil_geminations.items())[:5]:
    # Note that if you do not remove the angle brackets from the Tesserae citation, it will be ignored as a (bad) HTML tag in the formatting below.
    citation = k.replace('<', '').replace('>', '')
    citation = f'<span style="color: black; font-weight: bold;">{citation}</span>'
    text = display_gemination(v)
    # Citation on its own line, then the highlighted text, then a blank line
    html = '<br>'.join([citation, text])
    html += '<br><br>'
    display(HTML(html))
Here are the first five examples from our search:
sic canibus catulos similis , sic matribus haedos
nec spes libertatis erat , nec cura peculi :
aut Ararim Parthus bibet , aut Germania Tigrim ,
Ite meae , felix quondam pecus , ite capellae .
quam dives pecoris , nivei quam lactis abundans .
Moreover, we can write these matches to a file, formatting the geminations to make them easier to spot, here wrapping repetitions with asterisks.
def format_gemination(gemination):
    """Build a plain-text string for a line with repeated tokens in asterisks.

    Args:
        gemination: A two-item sequence. Item 0 is the line text, which is
            passed to ``nlp`` as-is; item 1 is the collection of ``norm_``
            values to mark.

    Returns:
        A string with one entry per token, each followed by a single space.
        Tokens whose ``norm_`` is in the marked collection are written as
        ``*token*``.
    """
    line = nlp(gemination[0])
    terms = gemination[1]

    pieces = []
    for token in line:
        if token.norm_ in terms:
            pieces.append(f'*{token}*')
        else:
            pieces.append(f'{token}')
    # Every token, including the last, is followed by one space
    return ''.join(f'{piece} ' for piece in pieces)
# Write one tab-separated "citation<TAB>formatted line" record per match.
# The encoding is set explicitly so the output does not depend on the
# platform's default locale encoding.
with open('eclogue_geminations.txt', 'w', encoding='utf-8') as f:
    for k, v in virgil_geminations.items():
        # Strip the angle brackets from the Tesserae citation key
        citation = k.replace('<', '').replace('>', '')
        text = format_gemination(v)
        f.write(f'{citation}\t{text}\n')
Note that a line like Ecl. 4.51 appears in the output…
terrasque tractusque maris caelumque profundum !
…as que is considered a token in the LatinCy model.
Accordingly, we may want to have a way to drop certain tokens from our matching process. We add below an exclude parameter to the get_geminations function to accomplish this.
def get_geminations(Doc, exclude=None):
    """Return the normalized word forms that occur more than once in ``Doc``.

    Args:
        Doc: An iterable of tokens, each exposing a ``norm_`` attribute
            (in this notebook, a spaCy ``Doc``).
        exclude: Optional collection of ``norm_`` values that are never
            reported, even when repeated (e.g. ``['que']``). Defaults to
            excluding nothing.

    Returns:
        A list of the non-excluded ``norm_`` values whose count is greater
        than 1, in order of first occurrence.
    """
    # None replaces the mutable default list; behavior matches an empty exclude
    exclude = () if exclude is None else exclude
    norm_counts = Counter(token.norm_ for token in Doc)
    return [
        norm
        for norm, count in norm_counts.items()
        if count > 1 and norm not in exclude
    ]
exclude = ['que']

test = nlp(preprocess(docrows['<verg. ecl. 4.51>'], lower=False))

# Same Doc, with and without the exclusion list
print('Before...')
print(get_geminations(test))
print()
print('After...')
print(get_geminations(test, exclude=exclude))
Before...
['que']
After...
[]
We write to file again, this time excluding que.
# Rebuild the citation -> (original line, repeated norms) mapping, ignoring 'que'
virgil_geminations = {}

for k, v in tqdm(docrows.items()):
    doc = nlp(preprocess(v))
    geminations = get_geminations(doc, exclude=['que'])
    if geminations:
        virgil_geminations[k] = (v, geminations)

# Overwrite the earlier output file. The encoding is set explicitly so the
# output does not depend on the platform's default locale encoding.
with open('eclogue_geminations.txt', 'w', encoding='utf-8') as f:
    for k, v in virgil_geminations.items():
        # Strip the angle brackets from the Tesserae citation key
        citation = k.replace('<', '').replace('>', '')
        text = format_gemination(v)
        f.write(f'{citation}\t{text}\n')
100%|██████████| 828/828 [00:03<00:00, 213.19it/s]
So far, we have worked only with the Eclogues. We could easily expand this gemination search to other texts in the Tesserae corpus. Here is an example of expanding it to all epic poems in the collection.
# Geminations in all Latin epic
# Note here I get the year from the Tesserae metadata, sort the files chronologically, and then discard the date information
epic = natsorted(
    [
        (file, int(T.metadata('date', file)))
        for file in T.fileids()
        if T.metadata('genre', file) == 'epic'
    ],
    key=lambda x: x[1],
)
epic = [file for file, _ in epic]
print(f'There are {len(epic)} epic poems in the Tesserae collection.')
There are 120 epic poems in the Tesserae collection.
# This takes about 7 minutes on my laptop
all_geminations = {}

for file in tqdm(epic):
    # NOTE: this rebinds the notebook-level `docrows` to the current file's rows
    docrows = next(T.doc_rows(file))
    for k, v in docrows.items():
        doc = nlp(preprocess(v))
        geminations = get_geminations(doc, exclude=['que'])
        if geminations:
            all_geminations[k] = (v, geminations)
100%|██████████| 120/120 [06:49<00:00, 3.42s/it]
# Write to file: a header row, then one tab-separated record per match.
# The encoding is set explicitly so the output does not depend on the
# platform's default locale encoding.
with open('epic_geminations.tsv', 'w', encoding='utf-8') as f:
    f.write('citation\ttext\n')
    for k, v in all_geminations.items():
        # Strip the angle brackets from the Tesserae citation key
        citation = k.replace('<', '').replace('>', '')
        text = format_gemination(v)
        f.write(f'{citation}\t{text}\n')
This has been an introduction to formalizing a literary critical/philological argument using LatinCy, an example that barely takes us past the first page of Wills Part I. In subsequent notebooks, we will explore variations on gemination and other types of repetition.