From 688e141047e831a7e5f58dd24e00de51a3cb2345 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?=
Date: Tue, 16 Aug 2022 20:22:46 +0200
Subject: [PATCH] greek-morphology: add csv

---
Notes for reviewers (not part of the commit message):
  * The separator literal used in parse() was unreadable in the reviewed
    copy of this patch and has been restored as "<br>"; verify it against
    the contents of input.txt before applying.
  * The author address is missing from the From: header above.
  * The "index" blob ids below were not recomputed for the revised files.

 .gitignore                  |   2 +
 greek-morphology/extract.py | 116 ++++++++++++++++++++++++++++++++++++
 greek-morphology/fetch.sh   |  10 ++++
 3 files changed, 128 insertions(+)
 create mode 100755 greek-morphology/extract.py
 create mode 100755 greek-morphology/fetch.sh

diff --git a/.gitignore b/.gitignore
index 67ee63c..5b81063 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ seh2g/Gesamtkatalog (11.03.2020).xml
 build
 stoepel_cache.sqlite
 result
+input.txt
+greek.csv
diff --git a/greek-morphology/extract.py b/greek-morphology/extract.py
new file mode 100755
index 0000000..39ea915
--- /dev/null
+++ b/greek-morphology/extract.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""Convert a Greek morphology word list into CSV on standard output.
+
+The input (PATH) is read as entries separated by blank lines.  The first
+line of an entry holds the spellings of one word form, separated by "|";
+every further line holds one or more "lemma : translation : analysis"
+interpretations.  One CSV row is written per interpretation, and progress
+messages go to standard error.
+"""
+import unicodedata
+import sys
+import csv
+from tqdm.auto import tqdm
+
+PATH = "input.txt"
+
+# Text that separates several interpretations on one value line.
+INTERPRETATION_SEPARATOR = "<br>"
+
+# Number of "formN" columns; further spellings of an entry are not written.
+MAX_FORMS = 5
+
+# str.translate() table that deletes U+0300..U+037F: the combining marks that
+# NFD splits off accented letters, plus the first code points of the Greek
+# block.  Built once instead of on every purge_accents() call.
+STRIPPED_CODE_POINTS = dict.fromkeys(range(0x300, 0x380))
+
+
+def parse_interpretation(interpretation):
+    """Split one "lemma : translation : analysis" string into a dict.
+
+    Each field is stripped of surrounding whitespace, and an empty field
+    becomes None.  If the string does not split into exactly three parts
+    on " : ", it is printed to standard error and the program exits with
+    status 1.
+    """
+    try:
+        lemma, translation, analysis = interpretation.split(" : ")
+    except ValueError:
+        # Wrong number of fields: show the offending text and give up.
+        print(interpretation, file=sys.stderr)
+        sys.exit(1)
+    return {
+        "lemma": lemma.strip() or None,
+        "translation": translation.strip() or None,
+        "analysis": analysis.strip() or None,
+    }
+
+
+def purge_accents(string):
+    """Return string lowercased and without code points U+0300..U+037F.
+
+    The string is NFD-normalized first, so precomposed accented letters are
+    split into a base letter and separate marks before the marks are removed.
+    """
+    return (
+        unicodedata.normalize("NFD", string)
+        .lower()
+        .translate(STRIPPED_CODE_POINTS)
+    )
+
+
+def parse(plain_text):
+    """Parse the whole input text into a list of entry dicts.
+
+    Each dict has the keys "form" (the list of spellings), "simple" (the
+    first spelling passed through purge_accents) and "interpretations" (a
+    list of parse_interpretation results).
+    """
+    print("parsing", file=sys.stderr)
+    result = []
+    for entry in tqdm(plain_text.split("\n\n")):
+        key, *value = entry.split("\n")
+        spellings = key.split("|")
+        result.append(
+            {
+                "form": spellings,
+                "simple": purge_accents(spellings[0]),
+                "interpretations": [
+                    parse_interpretation(interpretation)
+                    for interpretations in value
+                    # An empty line (e.g. from a newline at the very end of
+                    # the input) carries no interpretation; skip it.
+                    if interpretations
+                    for interpretation in interpretations.split(
+                        INTERPRETATION_SEPARATOR
+                    )
+                ],
+            }
+        )
+    return result
+
+
+def main():
+    """Read PATH and write one CSV row per interpretation to stdout."""
+    # Decode explicitly instead of relying on the locale's default encoding.
+    with open(PATH, "r", encoding="utf-8") as file:
+        plain_text = file.read()
+
+    forms = [f"form{n}" for n in range(1, MAX_FORMS + 1)]
+    fieldnames = ["simple", *forms, "lemma", "translation", "analysis"]
+
+    csv_writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
+    csv_writer.writeheader()
+    print("writing", file=sys.stderr)
+    for word in tqdm(parse(plain_text)):
+        for interpretation in word["interpretations"]:
+            csv_writer.writerow(
+                dict(zip(forms, word["form"]))
+                | {"simple": word["simple"]}
+                | interpretation
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/greek-morphology/fetch.sh b/greek-morphology/fetch.sh
new file mode 100755
index 0000000..81a4e50
--- /dev/null
+++ b/greek-morphology/fetch.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# Download the zip archive, extract it, and write the extracted *.txt files
+# to input.txt without the lines that start with "#".
+FILE=$(mktemp)
+OUTDIR=$(mktemp -d)
+# Remove the temporary download and extraction directory on exit.
+trap 'rm -rf "$FILE" "$OUTDIR"' EXIT
+curl -sSL 'https://www.mobileread.com/forums/attachment.php?attachmentid=135188&d=1424683732' -o "$FILE"
+unzip "$FILE" -d "$OUTDIR"
+sed '/^#/d' "$OUTDIR"/*.txt > input.txt