greek-morphology: add csv

This commit is contained in:
2022-08-16 20:22:46 +02:00
parent d5a38cf84e
commit 688e141047
3 changed files with 87 additions and 0 deletions

2
.gitignore vendored
View File

@@ -6,3 +6,5 @@ seh2g/Gesamtkatalog (11.03.2020).xml
build build
stoepel_cache.sqlite stoepel_cache.sqlite
result result
input.txt
greek.csv

79
greek-morphology/extract.py Executable file
View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
import unicodedata
import sys
import csv
from tqdm.auto import tqdm
# Plain-text dictionary dump read by this script, relative to the working
# directory. fetch.sh writes a file of this name.
PATH = "input.txt"
def parse_interpretation(interpretation):
    """Split one ``lemma : translation : analysis`` string into a dict.

    Args:
        interpretation: A single interpretation whose three fields are
            separated by ``" : "``.

    Returns:
        A dict with the keys ``"lemma"``, ``"translation"`` and
        ``"analysis"``. Each value has surrounding whitespace stripped; a
        field that is empty after stripping is stored as ``None``.

    Raises:
        SystemExit: With status 1 when the string does not split into
            exactly three fields. The offending string is printed to
            stderr first so the bad input line can be located.
    """
    try:
        lemma, translation, analysis = interpretation.split(" : ")
    except ValueError:
        # Too few or too many fields: unpacking failed. Only this error is
        # caught so that Ctrl-C and genuine programming errors still surface.
        print(interpretation, file=sys.stderr)
        sys.exit(1)
    return {
        "lemma": lemma.strip() or None,
        "translation": translation.strip() or None,
        "analysis": analysis.strip() or None,
    }
# Translation table that deletes every code point in U+0300..U+037F. Built
# once at import time because purge_accents runs for every dictionary entry.
# NOTE(review): besides the combining diacritical marks (U+0300..U+036F) the
# range also covers U+0370..U+037F, the first code points of the Greek and
# Coptic block; kept as in the original.
_ACCENT_DELETIONS = dict.fromkeys(range(0x300, 0x380))


def purge_accents(string):
    """Return *string* lower-cased with its diacritics removed.

    The text is decomposed with NFD so that accents and breathings become
    separate combining characters, lower-cased, and then every code point in
    U+0300..U+037F is deleted.

    Args:
        string: The text to simplify.

    Returns:
        The decomposed, lower-cased text without the deleted code points.
    """
    decomposed = unicodedata.normalize("NFD", string)
    return decomposed.lower().translate(_ACCENT_DELETIONS)
def parse(plain_text):
    """Parse the plain-text dictionary dump into a list of entry dicts.

    Entries are separated by an empty line. The first line of an entry holds
    the form, with alternative spellings separated by ``|``; each following
    line holds one or more interpretations separated by ``<br>``.

    Blank lines inside an entry and entries without any content (for example
    the one produced by a trailing newline at the end of the input) are
    skipped instead of being handed to ``parse_interpretation``.

    Args:
        plain_text: The complete contents of the input file.

    Returns:
        A list with one dict per entry:
        ``"form"`` is the list of spellings from the first line,
        ``"simple"`` is the first spelling passed through ``purge_accents``,
        ``"interpretations"`` is the list of ``parse_interpretation`` results.
    """
    print("parsing", file=sys.stderr)
    result = []
    for entry in tqdm(plain_text.split("\n\n")):
        lines = [line for line in entry.split("\n") if line.strip()]
        if not lines:
            # Nothing but whitespace between two separators.
            continue
        key, *value = lines
        spellings = key.split("|")
        result.append(
            {
                "form": spellings,
                "simple": purge_accents(spellings[0]),
                "interpretations": [
                    parse_interpretation(interpretation)
                    for interpretations in value
                    for interpretation in interpretations.split("<br>")
                ],
            }
        )
    return result
# Read the whole dump up front. The encoding is given explicitly so the
# Greek text is decoded the same way regardless of the locale's default.
with open(PATH, "r", encoding="utf-8") as file:
    plain_text = file.read()

# One CSV column per alternative spelling of a form: form1 .. form5.
forms = [f"form{n}" for n in range(1, 6)]
fieldnames = ["simple"] + forms + ["lemma", "translation", "analysis"]

# The CSV goes to stdout; progress messages and bars go to stderr so the
# output can be redirected into a file cleanly.
csv_writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
csv_writer.writeheader()

print("writing", file=sys.stderr)
for word in tqdm(parse(plain_text)):
    # zip() pairs spellings with form columns in order. Columns without a
    # spelling are left to DictWriter's default fill value, and spellings
    # beyond the fifth are dropped.
    form_columns = dict(zip(forms, word["form"]))
    # Emit one row per interpretation, repeating the form columns.
    for interpretation in word["interpretations"]:
        csv_writer.writerow(
            form_columns
            | {
                "simple": word["simple"],
                "lemma": interpretation["lemma"],
                "translation": interpretation["translation"],
                "analysis": interpretation["analysis"],
            }
        )

6
greek-morphology/fetch.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Download the zipped dictionary attachment, unpack it into a temporary
# directory and write its .txt contents, minus lines starting with '#',
# to input.txt in the current directory.

# Abort on the first failing step, on unset variables and on pipeline errors,
# so a failed download does not go unnoticed.
set -euo pipefail

FILE=$(mktemp)
OUTDIR=$(mktemp -d)
# Remove the temporary download and extraction directory on every exit path.
trap 'rm -rf -- "$FILE" "$OUTDIR"' EXIT

# -f makes curl fail on an HTTP error instead of saving the error page.
curl -fsSL 'https://www.mobileread.com/forums/attachment.php?attachmentid=135188&d=1424683732' -o "$FILE"

# unzip reports status 1 for warnings even though extraction completed;
# only higher statuses are treated as failure.
unzip "$FILE" -d "$OUTDIR" || [ "$?" -eq 1 ]

sed '/^#/d' "$OUTDIR"/*.txt > input.txt