to-hen/pun-sort/sort.py

#!/usr/bin/env python3

import sys
import string
import subprocess
from functools import lru_cache
import argparse

# -------------------------
# IPA helpers
# -------------------------

def get_ipa(word, lang="en"):
    """Return the IPA transcription of *word* as printed by ``espeak-ng``.

    Runs ``espeak-ng -v <lang> -q --ipa=3 <word>`` with stderr discarded and
    returns its standard output with surrounding whitespace, and then any
    surrounding ``/`` characters, removed.

    This is best-effort: if the command cannot be run or fails for any
    reason, an empty string is returned instead of raising.
    """
    command = ["espeak-ng", "-v", lang, "-q", "--ipa=3", word]
    try:
        raw = subprocess.check_output(
            command,
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except Exception:
        # Missing binary, non-zero exit status, invalid arguments, ...
        return ""
    return raw.strip().strip("/")

def ipa_tokenize(ipa):
    """Split an IPA string into a list of phoneme tokens.

    Rules:
      - The diphthongs "aɪ", "aʊ", "eɪ", "oʊ" and "ɔɪ" become one token each.
      - Stress marks ("ˈ", "ˌ") and "_" separators are dropped; two symbols
        separated by one of them are never merged into a diphthong.
      - Tie / joiner characters (U+0361, U+035C, U+200D) are removed before
        scanning, so a tied pair such as "o" + ZWJ + "ʊ" is still recognised
        as the diphthong "oʊ".
      - Every other character is a token of its own.

    NOTE(review): espeak's numeric ``--ipa=N`` modes are documented to insert
    tie or separator characters between symbols (which one depends on the
    espeak / espeak-ng version) -- confirm against the installed binary.

    Returns an empty list for an empty string.
    """
    diphthongs = {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}
    ties = "\u0361\u035c\u200d"
    skipped = "ˈˌ_"

    # Ties join their neighbours, so removing them up front makes the two
    # halves of a tied diphthong adjacent for the lookup below.
    ipa = "".join(ch for ch in ipa if ch not in ties)

    tokens = []
    i = 0
    while i < len(ipa):
        ch = ipa[i]
        if ch in skipped:
            i += 1
            continue
        pair = ipa[i:i + 2]
        if pair in diphthongs:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ch)
            i += 1
    return tokens

# -------------------------
# Distance
# -------------------------

# Single IPA characters that are treated as vowels by sub_cost().
VOWELS = set("aeiouəɪʊɔɛɜɑæ")


def _is_vowel(token):
    """Return True if *token* is a vowel or a diphthong starting with one."""
    # Tokens may be two-character diphthongs ("aɪ", "oʊ", ...), which are
    # never members of the single-character VOWELS set themselves, so
    # classify by the leading character. An empty token is not a vowel.
    return token[:1] in VOWELS


def sub_cost(a, b):
    """Return the cost of substituting phoneme token *a* with *b*.

    Costs:
      - 0.0 if the tokens are identical,
      - 0.6 if both are vowels (diphthongs count as vowels),
      - 2.0 if exactly one of them is a vowel,
      - 1.0 otherwise (neither is a vowel).
    """
    if a == b:
        return 0.0
    a_vowel = _is_vowel(a)
    b_vowel = _is_vowel(b)
    if a_vowel and b_vowel:
        return 0.6
    if a_vowel or b_vowel:
        return 2.0
    return 1.0

@lru_cache(maxsize=None)
def phonetic_distance(a, b):
    """Return the weighted edit distance between token sequences *a* and *b*.

    Insertions and deletions cost 1 each; a substitution costs
    ``sub_cost(x, y)``. Results are memoised, so both arguments must be
    hashable (e.g. tuples of tokens).
    """
    source = tuple(a)
    target = tuple(b)

    # Row 0: an empty prefix of `source` becomes target[:j] with j insertions.
    previous = list(range(len(target) + 1))

    for i, src_tok in enumerate(source, start=1):
        # Column 0: source[:i] becomes the empty sequence with i deletions.
        current = [i]
        for j, tgt_tok in enumerate(target, start=1):
            current.append(
                min(
                    previous[j] + 1,
                    current[j - 1] + 1,
                    previous[j - 1] + sub_cost(src_tok, tgt_tok),
                )
            )
        previous = current

    return previous[-1]

# -------------------------
# Seriation
# -------------------------

def seriate(words, ipas):
    """Order *words* greedily so that neighbours sound alike.

    Starting from the first word, repeatedly appends the not-yet-used word
    whose length-normalised phonetic distance to the current last word is
    smallest. Ties are broken by input order, so the result is the same on
    every run.

    Args:
        words: Words to order; duplicates are ignored after their first
            occurrence.
        ipas: Mapping from each word to its sequence of phoneme tokens
            (must be hashable, since it is passed to phonetic_distance).

    Returns:
        A list with each distinct word exactly once; ``[]`` for empty input.
    """
    # A list (not a set) keeps candidates in input order, which makes
    # min()'s first-wins tie-breaking independent of string hashing.
    remaining = list(dict.fromkeys(words))
    if not remaining:
        return []

    def normalized_distance(cur, cand):
        cur_ipa, cand_ipa = ipas[cur], ipas[cand]
        # max(..., 1) avoids dividing by zero when both transcriptions are empty.
        return phonetic_distance(cur_ipa, cand_ipa) / max(len(cur_ipa), len(cand_ipa), 1)

    path = [remaining.pop(0)]
    while remaining:
        cur = path[-1]
        nxt = min(remaining, key=lambda w: normalized_distance(cur, w))
        path.append(nxt)
        remaining.remove(nxt)

    return path

# -------------------------
# Main
# -------------------------


def tokenize_stdin():
    """
    Reads stdin and returns a list of lowercase words.
    Handles:
      - Unicode letters (ä, ö, ü, ß, é, ñ, etc.), which are kept as-is
      - Ignores punctuation: every character in string.punctuation plus any
        character in a Unicode punctuation category (curly quotes, dashes,
        ellipsis, ...) is removed, not replaced by a space

    Words are whatever remains between runs of whitespace; duplicates are
    preserved. Returns an empty list when stdin is empty.
    """
    import unicodedata

    # lower() rather than casefold() so that letters such as "ß" survive.
    text = sys.stdin.read().lower()
    ascii_punct = set(string.punctuation)
    cleaned = "".join(
        ch
        for ch in text
        if ch not in ascii_punct and not unicodedata.category(ch).startswith("P")
    )
    return cleaned.split()


def main():
    """Command-line entry point.

    Reads words from stdin, removes duplicates (keeping first occurrences),
    transcribes each word to IPA with espeak-ng in the language given by
    ``--lang`` / ``-l``, orders the words by phonetic similarity and prints
    one ``word<TAB>/ipa/`` line per word. Prints nothing if stdin contains
    no words.
    """
    parser = argparse.ArgumentParser(description="Pun-sort words by phonetic similarity")
    parser.add_argument(
        "--lang", "-l",
        type=str,
        default="en",
        help="Language code for espeak-ng (default: en)"
    )
    args = parser.parse_args()

    # dict.fromkeys drops duplicates while preserving input order.
    words = list(dict.fromkeys(tokenize_stdin()))
    if not words:
        # Empty input: nothing to order or print.
        return

    # Tuples so the transcriptions are hashable for the cached distance.
    ipas = {w: tuple(ipa_tokenize(get_ipa(w, lang=args.lang))) for w in words}

    for w in seriate(words, ipas):
        print(f"{w}\t/{''.join(ipas[w])}/")

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()