pun-sort: make language-gnostic

This commit is contained in:
2025-12-28 21:22:13 +01:00
parent 612acab4fc
commit c50502225e
2 changed files with 38 additions and 10 deletions

View File

@@ -1,10 +1,10 @@
#!/bin/sh #!/bin/sh
set -efu set -efu
tokenize() { text_en="Once upon a time, in a quiet village nestled between rolling hills and sparkling rivers, there lived a clever fox named Felix. Felix was known throughout the village for his cunning tricks and playful antics. Every morning, he would sneak through the meadows, darting past rabbits and chickens, always careful to avoid the farmer's watchful eyes. Despite his mischievous ways, Felix had a kind heart and often shared his clever solutions with friends in need. One day, a heavy storm swept through the valley, leaving many paths muddy and rivers swollen. Felix saw his chance to help: he guided lost ducklings back to their pond, and showed the frightened kittens how to find shelter under the sturdy oak trees. The villagers watched in amazement as the fox moved gracefully through the rain-soaked fields, his orange fur glistening and his sharp eyes alert. By the time the storm passed, the village had gained a newfound respect for Felix. Tales of his bravery spread far and wide, carried by wandering merchants and whispered by children as they played near the cobblestone streets. Nights in the village were quiet once more, but the story of Felix, the fox who danced through storm and shadow, continued to inspire laughter, cleverness, and courage in the hearts of all who heard it."
tr -cs '[:alpha:]' '\n' | tr '[:upper:]' '[:lower:]'
}
text="Once upon a time, in a quiet village nestled between rolling hills and sparkling rivers, there lived a clever fox named Felix. Felix was known throughout the village for his cunning tricks and playful antics. Every morning, he would sneak through the meadows, darting past rabbits and chickens, always careful to avoid the farmer's watchful eyes. Despite his mischievous ways, Felix had a kind heart and often shared his clever solutions with friends in need. One day, a heavy storm swept through the valley, leaving many paths muddy and rivers swollen. Felix saw his chance to help: he guided lost ducklings back to their pond, and showed the frightened kittens how to find shelter under the sturdy oak trees. The villagers watched in amazement as the fox moved gracefully through the rain-soaked fields, his orange fur glistening and his sharp eyes alert. By the time the storm passed, the village had gained a newfound respect for Felix. Tales of his bravery spread far and wide, carried by wandering merchants and whispered by children as they played near the cobblestone streets. Nights in the village were quiet once more, but the story of Felix, the fox who danced through storm and shadow, continued to inspire laughter, cleverness, and courage in the hearts of all who heard it." echo "$text_en" | python3 sort.py
echo "$text" | tokenize | python3 sort.py text_de="In einem kleinen Dorf, versteckt zwischen sanften Hügeln und klaren Bächen, lebte ein listiger Fuchs namens Fritz. Fritz war bekannt für seine cleveren Streiche und seine verspielte Natur. Jeden Morgen schlich er durch die Wiesen, huschte an Hasen und Hühnern vorbei und achtete dabei genau auf die wachsamen Augen des Bauern. Trotz seiner schelmischen Art hatte Fritz ein gutes Herz und half oft Freunden in Not. Eines Tages fegte ein heftiger Sturm durch das Tal, die Wege wurden schlammig und die Flüsse traten über die Ufer. Fritz sah seine Chance, zu helfen: Er führte verlorene Entenküken zurück zu ihrem Teich und zeigte den ängstlichen Kätzchen, wie sie Schutz unter den starken Eichen finden konnten. Die Dorfbewohner beobachteten erstaunt, wie der Fuchs anmutig durch die regengetränkten Felder sprang, sein orangefarbenes Fell glänzte und seine scharfen Augen waren stets wachsam. Nachdem der Sturm vorübergezogen war, gewann das Dorf großen Respekt für Fritz. Geschichten über seine Tapferkeit verbreiteten sich weit und breit, getragen von reisenden Händlern und von Kindern, die beim Spielen auf den Kopfsteinpflasterstraßen flüsterten. Die Nächte im Dorf waren wieder ruhig, aber die Erzählung von Fritz, dem Fuchs, der durch Sturm und Schatten tanzte, inspirierte weiterhin Lachen, Cleverness und Mut in den Herzen aller, die davon hörten."
echo "$text_de" | python3 sort.py -l de

View File

@@ -1,17 +1,19 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sys import sys
import string
import subprocess import subprocess
from functools import lru_cache from functools import lru_cache
import argparse
# ------------------------- # -------------------------
# IPA helpers # IPA helpers
# ------------------------- # -------------------------
def get_ipa(word): def get_ipa(word, lang="en"):
try: try:
out = subprocess.check_output( out = subprocess.check_output(
["espeak-ng", "-q", "--ipa=3", word], ["espeak-ng", "-v", lang, "-q", "--ipa=3", word],
stderr=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
text=True text=True
) )
@@ -84,7 +86,7 @@ def seriate(words, ipas):
while unused: while unused:
cur = path[-1] cur = path[-1]
nxt = min( nxt = min(
unused, unused,
key=lambda w: phonetic_distance(ipas[cur], ipas[w]) / max(len(ipas[cur]), len(ipas[w]), 1) key=lambda w: phonetic_distance(ipas[cur], ipas[w]) / max(len(ipas[cur]), len(ipas[w]), 1)
) )
path.append(nxt) path.append(nxt)
@@ -96,9 +98,35 @@ def seriate(words, ipas):
# Main # Main
# ------------------------- # -------------------------
def tokenize_stdin():
    """
    Read all of stdin and return its words as a list of lowercase tokens.

    Handles:
    - Unicode letters (ä, ö, ü, ß, é, ñ, etc.): kept, only lowercased
    - ASCII punctuation (string.punctuation): deleted rather than replaced
      by a space, so "rain-soaked" becomes "rainsoaked" and "farmer's"
      becomes "farmers"
    - Splitting: on any run of whitespace; input order and duplicates are
      preserved, and empty input yields an empty list
    """
    text = sys.stdin.read()
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # Lowercase so that "The" and "the" are the same token, as the docstring
    # promises. str.lower() is used instead of casefold() because casefold
    # would rewrite "ß" as "ss" and change the spelling of the word.
    return text.translate(strip_punctuation).lower().split()
def main(): def main():
words = [w.strip() for w in sys.stdin if w.strip()] parser = argparse.ArgumentParser(description="Pun-sort words by phonetic similarity")
ipas = {w: tuple(ipa_tokenize(get_ipa(w))) for w in words} parser.add_argument(
"--lang", "-l",
type=str,
default="en",
help="Language code for espeak-ng (default: en)"
)
args = parser.parse_args()
LANG = args.lang
words = tokenize_stdin()
words = list(dict.fromkeys(words))
ipas = {w: tuple(ipa_tokenize(get_ipa(w, lang=LANG))) for w in words}
ordered = seriate(words, ipas) ordered = seriate(words, ipas)