2025-12-28 21:06:24 +01:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
import sys
|
2025-12-28 21:22:13 +01:00
|
|
|
|
import string
|
2025-12-28 21:06:24 +01:00
|
|
|
|
import subprocess
|
|
|
|
|
|
from functools import lru_cache
|
2025-12-28 21:22:13 +01:00
|
|
|
|
import argparse
|
2025-12-28 21:06:24 +01:00
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# IPA helpers
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
|
2025-12-28 21:22:13 +01:00
|
|
|
|
def get_ipa(word, lang="en"):
    """Return the IPA transcription of *word* as printed by ``espeak-ng``.

    Runs ``espeak-ng -v <lang> -q --ipa=3 <word>`` and returns its standard
    output with surrounding whitespace and ``/`` delimiters stripped.

    Args:
        word: The word to transcribe.
        lang: Voice/language code handed to ``espeak-ng -v`` (default "en").

    Returns:
        The transcription string, or "" when espeak-ng cannot be started,
        exits with a non-zero status, is given an argument it cannot accept
        (e.g. an embedded NUL), or prints bytes that are not valid UTF-8.
    """
    try:
        out = subprocess.check_output(
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", word],
            stderr=subprocess.DEVNULL,
            # IPA output is non-ASCII. ``text=True`` alone would decode with
            # the locale's preferred encoding, which is not necessarily
            # UTF-8; passing ``encoding`` still selects text mode.
            encoding="utf-8",
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: binary missing / not executable.
        # SubprocessError: non-zero exit status (CalledProcessError).
        # ValueError: invalid argument or undecodable output (UnicodeError
        # is a ValueError subclass).
        # Best effort: the caller treats "" as "no transcription".
        return ""
    return out.strip().strip("/")
|
|
|
|
|
|
|
|
|
|
|
|
def ipa_tokenize(ipa):
    """Split an IPA transcription string into a list of phoneme tokens.

    Stress marks (``ˈ`` and ``ˌ``) are dropped. The diphthongs ``aɪ``,
    ``aʊ``, ``eɪ``, ``oʊ`` and ``ɔɪ`` are kept together as one two-character
    token; every other character becomes its own token.

    Tie/joiner characters (U+200D ZERO WIDTH JOINER and U+0361 COMBINING
    DOUBLE INVERTED BREVE) are removed before scanning, so a diphthong
    written as ``a<joiner>ɪ`` is still recognised and the joiner never shows
    up as a token of its own.
    NOTE(review): espeak-ng's ``--ipa=3`` mode is understood to emit U+200D
    inside multi-letter phonemes -- confirm against the installed version.

    Args:
        ipa: Transcription string (may be empty).

    Returns:
        List of token strings, in order of appearance.
    """
    stress_marks = {"ˈ", "ˌ"}
    joiners = {"\u200d", "\u0361"}
    diphthongs = {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}

    # Only joiners are removed up front. A stress mark between two vowels
    # marks a syllable boundary, so it must keep separating them.
    ipa = "".join(ch for ch in ipa if ch not in joiners)

    tokens = []
    i = 0
    while i < len(ipa):
        ch = ipa[i]
        if ch in stress_marks:
            i += 1
            continue
        pair = ipa[i:i + 2]
        if pair in diphthongs:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ch)
            i += 1
    return tokens
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Distance
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
|
|
|
|
|
|
# Single-character IPA vowel symbols. The second group adds vowels that were
# previously missing and were therefore costed as consonants.
VOWELS = set(
    "aeiouəɪʊɔɛɜɑæ"
    "ʌɒɐɚɝyøœɨʉɯɤɵ"
)


def _is_vowel(token):
    """Return True if *token* is a vowel or a diphthong starting with one."""
    # Diphthong tokens are two characters long, so test the first symbol
    # rather than the whole token; "" safely yields False.
    return token[:1] in VOWELS


def sub_cost(a, b):
    """Return the cost of substituting phoneme token *a* with *b*.

    Costs:
        0.0  identical tokens
        0.6  vowel <-> vowel (diphthongs count as vowels)
        2.0  vowel <-> consonant
        1.0  consonant <-> consonant

    Args:
        a: Phoneme token (single IPA character or a diphthong string).
        b: Phoneme token to compare against.

    Returns:
        The substitution cost as a float.
    """
    if a == b:
        return 0.0
    a_vowel = _is_vowel(a)
    b_vowel = _is_vowel(b)
    if a_vowel and b_vowel:
        return 0.6
    if a_vowel or b_vowel:
        return 2.0
    return 1.0
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=None)
def phonetic_distance(a, b):
    """Weighted edit distance between two phoneme-token sequences.

    Insertions and deletions cost 1; substitutions cost
    ``sub_cost(token_a, token_b)``. Results are memoised with ``lru_cache``,
    so both arguments must be hashable (e.g. tuples of tokens).

    Args:
        a: Source sequence of phoneme tokens.
        b: Target sequence of phoneme tokens.

    Returns:
        The minimal total edit cost to turn *a* into *b*.
    """
    source, target = tuple(a), tuple(b)
    width = len(target)

    # Row 0 of the DP table: turning the empty prefix into target[:j]
    # takes j insertions.
    above = list(range(width + 1))

    for row_no, src_tok in enumerate(source, start=1):
        # Column 0: turning source[:row_no] into the empty prefix takes
        # row_no deletions.
        current = [row_no]
        for col_no, dst_tok in enumerate(target, start=1):
            deletion = above[col_no] + 1
            insertion = current[col_no - 1] + 1
            substitution = above[col_no - 1] + sub_cost(src_tok, dst_tok)
            current.append(min(deletion, insertion, substitution))
        above = current

    return above[width]
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Seriation
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def _normalized_distance(ipa_a, ipa_b):
    """Phonetic distance scaled by the longer sequence's length (min. 1)."""
    return phonetic_distance(ipa_a, ipa_b) / max(len(ipa_a), len(ipa_b), 1)


def seriate(words, ipas):
    """Greedily chain *words* so each is followed by its nearest-sounding one.

    Starts with the first word, then repeatedly appends the not-yet-used word
    whose length-normalised phonetic distance to the last chosen word is
    smallest. Ties go to the word that appears earliest in *words*, so the
    result is deterministic for a given input.

    Args:
        words: Iterable of words; duplicates are ignored after their first
            occurrence.
        ipas: Mapping from each word to its hashable sequence of phoneme
            tokens (as accepted by ``phonetic_distance``).

    Returns:
        A list containing every distinct word exactly once, in chained
        order; an empty list when *words* is empty.
    """
    # An ordered list (not a set) keeps tie-breaking independent of string
    # hash randomisation between runs.
    remaining = list(dict.fromkeys(words))
    if not remaining:
        return []

    path = [remaining.pop(0)]
    while remaining:
        cur_ipa = ipas[path[-1]]
        nxt = min(
            remaining,
            key=lambda w: _normalized_distance(cur_ipa, ipas[w]),
        )
        path.append(nxt)
        remaining.remove(nxt)

    return path
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Main
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
|
2025-12-28 21:22:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tokenize_stdin():
    """Read all of stdin and return its words, lowercased, in input order.

    ASCII punctuation (every character in ``string.punctuation``) is deleted
    before the text is split on whitespace, so ``don't`` becomes ``dont`` and
    ``well-known`` becomes ``wellknown``. Non-ASCII letters (ä, ö, ü, ß, é,
    ñ, ...) are kept; non-ASCII punctuation is not removed.

    Returns:
        List of lowercase word strings; empty when stdin has no words.
    """
    text = sys.stdin.read()
    without_punct = text.translate(str.maketrans("", "", string.punctuation))
    # Lowercase so that "The" and "the" are treated as the same word.
    return without_punct.lower().split()
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-12-28 21:06:24 +01:00
|
|
|
|
def main():
    """Command-line entry point: pun-sort the words read from stdin.

    Parses ``--lang/-l`` (espeak-ng language code, default "en"), reads and
    de-duplicates the words from stdin, transcribes each with ``get_ipa``,
    orders them with ``seriate`` and prints one ``word<TAB>/ipa/`` line per
    word. Prints nothing when stdin contains no words.
    """
    parser = argparse.ArgumentParser(description="Pun-sort words by phonetic similarity")
    parser.add_argument(
        "--lang", "-l",
        type=str,
        default="en",
        help="Language code for espeak-ng (default: en)",
    )
    args = parser.parse_args()
    lang = args.lang

    # dict.fromkeys drops duplicates while keeping first-seen order.
    words = list(dict.fromkeys(tokenize_stdin()))
    if not words:
        # Nothing to order; avoid indexing into an empty word list.
        return

    ipas = {w: tuple(ipa_tokenize(get_ipa(w, lang=lang))) for w in words}

    for w in seriate(words, ipas):
        print(f"{w}\t/{''.join(ipas[w])}/")
|
|
|
|
|
|
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|