Files

138 lines
3.0 KiB
Python
Raw Permalink Normal View History

2025-12-28 21:06:24 +01:00
#!/usr/bin/env python3
import sys
2025-12-28 21:22:13 +01:00
import string
2025-12-28 21:06:24 +01:00
import subprocess
from functools import lru_cache
2025-12-28 21:22:13 +01:00
import argparse
2025-12-28 21:06:24 +01:00
# -------------------------
# IPA helpers
# -------------------------
2025-12-28 21:22:13 +01:00
def get_ipa(word, lang="en"):
    """Return the IPA transcription of *word* as printed by ``espeak-ng``.

    Args:
        word: The word to transcribe.
        lang: Voice/language code passed to ``espeak-ng -v`` (default "en").

    Returns:
        The transcription with surrounding whitespace and ``/`` removed, or
        an empty string when *word* is empty or ``espeak-ng`` cannot be run,
        exits with an error, or produces undecodable output.
    """
    # Nothing to transcribe; avoid spawning a process for an empty argument.
    if not word:
        return ""
    try:
        out = subprocess.check_output(
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", word],
            stderr=subprocess.DEVNULL,
            # IPA is non-ASCII; decode explicitly as UTF-8 instead of relying
            # on the locale's preferred encoding (which may be ASCII).
            encoding="utf-8",
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best effort: a missing binary, a non-zero exit status or output
        # that cannot be decoded all yield "no transcription".
        return ""
    return out.strip().strip("/")
# Two-character vowel sequences that are kept together as a single token.
_DIPHTHONGS = frozenset({"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"})

# Primary and secondary stress marks; they are dropped during tokenization.
_STRESS_MARKS = frozenset("ˈˌ")


def ipa_tokenize(ipa):
    """Split an IPA string into a list of phoneme tokens.

    Stress marks (``ˈ`` and ``ˌ``) are skipped. A two-character sequence
    listed in ``_DIPHTHONGS`` becomes one token; every other character is a
    token of its own.

    Args:
        ipa: IPA transcription as a string.

    Returns:
        List of token strings, in order of appearance.
    """
    tokens = []
    i = 0
    n = len(ipa)
    while i < n:
        if ipa[i] in _STRESS_MARKS:
            i += 1
            continue
        # At the last character this slice has length 1 and never matches.
        pair = ipa[i:i + 2]
        if pair in _DIPHTHONGS:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ipa[i])
            i += 1
    return tokens
# -------------------------
# Distance
# -------------------------
# Single IPA characters that are treated as vowels by the cost function.
VOWELS = set("aeiouəɪʊɔɛɜɑæ")


def _is_vowel(token):
    """Return True when every character of a non-empty *token* is a vowel."""
    return bool(token) and all(ch in VOWELS for ch in token)


def sub_cost(a, b):
    """Return the cost of substituting phoneme token *a* with *b*.

    Costs:
        0.0  identical tokens
        0.6  vowel replaced by a different vowel
        2.0  vowel replaced by a non-vowel (or the reverse)
        1.0  non-vowel replaced by a different non-vowel

    Multi-character tokens (e.g. the diphthong "aɪ") count as vowels when
    all of their characters are in ``VOWELS``.
    """
    if a == b:
        return 0.0
    a_is_vowel = _is_vowel(a)
    b_is_vowel = _is_vowel(b)
    if a_is_vowel and b_is_vowel:
        return 0.6
    if a_is_vowel or b_is_vowel:
        return 2.0
    return 1.0
@lru_cache(maxsize=None)
def _phonetic_distance_cached(a, b):
    """Weighted edit distance between two tuples of tokens (memoized)."""
    m = len(b)
    # Row for the empty prefix of ``a``: j insertions turn () into b[:j].
    prev = list(range(m + 1))
    for i, tok_a in enumerate(a, start=1):
        # First column: i deletions turn a[:i] into ().
        cur = [i]
        for j, tok_b in enumerate(b, start=1):
            cur.append(
                min(
                    prev[j] + 1,
                    cur[j - 1] + 1,
                    prev[j - 1] + sub_cost(tok_a, tok_b),
                )
            )
        prev = cur
    return prev[m]


def phonetic_distance(a, b):
    """Return the weighted edit distance between token sequences *a* and *b*.

    Insertions and deletions cost 1; substitutions cost ``sub_cost(x, y)``.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        a: Iterable of hashable phoneme tokens (tuple, list, str, ...).
        b: Iterable of hashable phoneme tokens.

    Returns:
        The distance as a number (an int when no substitution cost
        contributes to the result, otherwise possibly a float).
    """
    # Normalise to tuples *before* the cache lookup so that unhashable
    # inputs such as lists are accepted.
    return _phonetic_distance_cached(tuple(a), tuple(b))


# Keep the cache-management helpers reachable under the public name.
phonetic_distance.cache_info = _phonetic_distance_cached.cache_info
phonetic_distance.cache_clear = _phonetic_distance_cached.cache_clear
# -------------------------
# Seriation
# -------------------------
def seriate(words, ipas):
    """Order *words* greedily so that neighbours sound alike.

    Starting from the first word, repeatedly append the not-yet-used word
    whose length-normalised ``phonetic_distance`` to the current last word
    is smallest. Ties are broken by input order, so the result is the same
    on every run.

    Args:
        words: Sequence of words; duplicates are used once.
        ipas: Mapping from each word to its sequence of IPA tokens.

    Returns:
        A new list containing each distinct word exactly once. Empty input
        yields an empty list.
    """
    if not words:
        return []

    # A list (not a set) keeps candidate order stable for tie-breaking.
    remaining = list(dict.fromkeys(words))
    path = [remaining.pop(0)]

    while remaining:
        cur_ipa = ipas[path[-1]]
        nxt = min(
            remaining,
            # Normalise by the longer sequence; the trailing 1 guards
            # against division by zero when both transcriptions are empty.
            key=lambda w: phonetic_distance(cur_ipa, ipas[w])
            / max(len(cur_ipa), len(ipas[w]), 1),
        )
        path.append(nxt)
        remaining.remove(nxt)
    return path
# -------------------------
# Main
# -------------------------
2025-12-28 21:22:13 +01:00
def tokenize_stdin():
    """Read all of stdin and return its words, lowercased.

    ASCII punctuation (``string.punctuation``) is deleted, the text is
    lowercased, and the result is split on whitespace. Non-ASCII letters
    (ä, ö, ü, ß, é, ñ, ...) are kept as they are; non-ASCII punctuation is
    not removed.

    Returns:
        List of lowercase word strings, in input order (duplicates kept).
    """
    text = sys.stdin.read()
    without_punctuation = text.translate(
        str.maketrans("", "", string.punctuation)
    )
    return without_punctuation.lower().split()
2025-12-28 21:06:24 +01:00
def main():
    """Command-line entry point.

    Reads words from stdin, removes duplicates while keeping first-seen
    order, transcribes each word with ``get_ipa``, orders the words with
    ``seriate`` and prints one ``word<TAB>/ipa/`` line per word. Prints
    nothing when stdin contains no words.
    """
    parser = argparse.ArgumentParser(description="Pun-sort words by phonetic similarity")
    parser.add_argument(
        "--lang", "-l",
        type=str,
        default="en",
        help="Language code for espeak-ng (default: en)"
    )
    args = parser.parse_args()

    # dict.fromkeys de-duplicates while preserving first-seen order.
    words = list(dict.fromkeys(tokenize_stdin()))
    if not words:
        # Nothing to sort; avoids indexing into an empty list downstream.
        return

    ipas = {w: tuple(ipa_tokenize(get_ipa(w, lang=args.lang))) for w in words}
    for w in seriate(words, ipas):
        print(f"{w}\t/{''.join(ipas[w])}/")


if __name__ == "__main__":
    main()