pun-sort: add api

This commit is contained in:
2025-12-28 21:58:59 +01:00
parent dfd03dd376
commit 950805bc9d

286
pun-sort/sort_api.py Executable file
View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
FastAPI backend for phonetic word sorting
Sorts words by their phonetic similarity using espeak-ng IPA transcription
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import List, Optional
import string
import subprocess
from functools import lru_cache
# FastAPI application; the metadata below feeds the auto-generated OpenAPI docs.
app = FastAPI(
    title="Phonetic Word Sorter API",
    description="Sort words by phonetic similarity using IPA transcription",
    version="1.0.0"
)
# CORS middleware
# NOTE(review): browsers reject `Access-Control-Allow-Origin: *` when
# credentials are allowed — `allow_credentials=True` combined with
# `allow_origins=["*"]` likely does not do what was intended; confirm
# whether credentialed cross-origin requests are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# -------------------------
# Models
# -------------------------
class SortRequest(BaseModel):
    """Request body for POST /sort."""
    # Free text; it is tokenized into words server-side.
    text: str = Field(..., description="Text containing words to sort")
    lang: str = Field("en", description="Language code for espeak-ng (e.g., 'en', 'de', 'es')")

    class Config:
        # NOTE(review): `schema_extra` is the Pydantic v1 spelling; Pydantic v2
        # renamed it to `json_schema_extra` (via `model_config`) — confirm the
        # installed Pydantic version, otherwise this example is silently ignored.
        schema_extra = {
            "example": {
                "text": "The quick brown fox jumps over the lazy dog",
                "lang": "en"
            }
        }
class WordIPA(BaseModel):
    """A single word paired with its IPA transcription."""
    word: str
    ipa: str
class SortResponse(BaseModel):
    """Response body for POST /sort: words in phonetic-similarity order."""
    sorted_words: List[WordIPA]
class IPARequest(BaseModel):
    """Request body for POST /ipa: one word to transcribe."""
    word: str
    lang: str = Field("en", description="Language code for espeak-ng")
class IPAResponse(BaseModel):
    """Response body for POST /ipa: transcription plus its phoneme tokens."""
    word: str
    ipa: str
    tokens: List[str]
# -------------------------
# IPA helpers
# -------------------------
def get_ipa(word: str, lang: str = "en") -> str:
    """Return the IPA transcription of *word* using the espeak-ng CLI.

    Args:
        word: Single word to transcribe.
        lang: espeak-ng voice/language code (e.g. 'en', 'de', 'es').

    Returns:
        The IPA string with surrounding '/' delimiters stripped, or "" when
        espeak-ng cannot transcribe the word (best-effort contract; callers
        treat an empty string as "no transcription").

    Raises:
        HTTPException: 504 when espeak-ng hangs; 500 when it is not installed.
    """
    try:
        out = subprocess.check_output(
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", word],
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=5,
        )
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=504, detail="espeak-ng timeout")
    except FileNotFoundError:
        raise HTTPException(
            status_code=500,
            detail="espeak-ng not found. Please install it: apt-get install espeak-ng"
        )
    except (subprocess.CalledProcessError, OSError):
        # Narrowed from the original bare `except Exception` (which also had an
        # unused `e`): a non-zero exit (e.g. unknown voice) or OS-level failure
        # keeps the best-effort contract and signals failure via "".
        return ""
    return out.strip().strip("/")
# Diphthongs that must remain single tokens. The original set was garbled —
# two entries had degraded to empty strings; "aʊ" and "oʊ" restore the
# standard English diphthong inventory (aɪ, aʊ, eɪ, oʊ, ɔɪ).
_DIPHTHONGS = {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}


def ipa_tokenize(ipa: str) -> List[str]:
    """Tokenize an IPA string into a list of phoneme tokens.

    Stress markers (ˈ, ˌ) and the '_' phoneme separator that espeak-ng emits
    with --ipa=3 carry no segmental information and are dropped. Two-character
    diphthongs are kept as single tokens; every other character is one token.

    Args:
        ipa: Raw IPA string as produced by get_ipa().

    Returns:
        List of phoneme tokens (empty list for an empty input).
    """
    tokens: List[str] = []
    i = 0
    while i < len(ipa):
        ch = ipa[i]
        # Skip stress markers and espeak-ng's --ipa=3 phoneme separator.
        if ch in "ˈˌ_":
            i += 1
            continue
        # Out-of-range slices yield short strings, never IndexError,
        # so no explicit bounds check is needed before the 2-char probe.
        pair = ipa[i:i + 2]
        if pair in _DIPHTHONGS:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ch)
            i += 1
    return tokens
# -------------------------
# Distance calculation
# -------------------------
# Vowel inventory used to scale substitution penalties.
VOWELS = set("aeiouəɪʊɔɛɜɑæ")


def sub_cost(a: str, b: str) -> float:
    """Return the cost of substituting phoneme *a* for phoneme *b*.

    0.0 for identical phonemes; 0.6 for vowel↔vowel (perceptually close);
    2.0 for vowel↔consonant (perceptually far); 1.0 for consonant↔consonant.
    """
    if a == b:
        return 0.0
    a_is_vowel = a in VOWELS
    b_is_vowel = b in VOWELS
    if a_is_vowel and b_is_vowel:
        return 0.6  # vowel-for-vowel swaps sound similar
    if a_is_vowel != b_is_vowel:
        return 2.0  # crossing the vowel/consonant boundary is penalized hard
    return 1.0      # plain consonant-for-consonant substitution
# Bounded cache: the key space (arbitrary word pairs) grows without limit in a
# long-running server, so maxsize=None was a slow memory leak.
@lru_cache(maxsize=65536)
def phonetic_distance(a: tuple, b: tuple) -> float:
    """Weighted Levenshtein distance between two phoneme-token tuples.

    Insertions and deletions cost 1; substitutions cost sub_cost(x, y).
    Tuples (not lists) are required so the arguments are hashable for the cache.

    Args:
        a: Phoneme tokens of the first word.
        b: Phoneme tokens of the second word.

    Returns:
        Non-negative distance; 0.0 iff the sequences are identical.
    """
    n, m = len(a), len(b)
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    # Base cases: transforming to/from the empty sequence costs one per token.
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            dp[i][j] = min(
                dp[i - 1][j] + 1,                               # delete a[i-1]
                dp[i][j - 1] + 1,                               # insert b[j-1]
                dp[i - 1][j - 1] + sub_cost(a[i - 1], b[j - 1]) # substitute
            )
    return dp[n][m]
def tokenize_text(text: str) -> List[str]:
    """Split *text* into lowercase words with ASCII punctuation removed.

    Non-ASCII letters (ä, ö, ü, ß, é, ñ, ...) pass through untouched — only
    `string.punctuation` characters are stripped. Note the strip happens
    before splitting, so punctuation-joined words merge ("don't" -> "dont").
    """
    # One C-level translate pass removes every ASCII punctuation character.
    cleaned = text.translate(str.maketrans("", "", string.punctuation))
    return [token.lower() for token in cleaned.split()]
# -------------------------
# Seriation algorithm
# -------------------------
def seriate(words: List[str], ipas: dict) -> List[str]:
    """Order *words* so phonetically similar neighbors end up adjacent.

    Greedy nearest-neighbor chain: start from the first word and repeatedly
    append the remaining word with the smallest length-normalized phonetic
    distance to the current tail.

    Args:
        words: Unique words to order (the /sort endpoint dedupes before calling).
        ipas: Mapping of word -> tuple of IPA phoneme tokens.

    Returns:
        The words in seriation order; inputs of length 0 or 1 are returned as-is.
    """
    if len(words) <= 1:
        return words
    # Keep candidates in a list in input order rather than a set: min() breaks
    # ties by iteration order, and set iteration order varies with
    # PYTHONHASHSEED, which made tied-distance output nondeterministic.
    remaining = list(words[1:])
    path = [words[0]]
    while remaining:
        cur_ipa = ipas[path[-1]]  # hoisted out of the key lambda
        nxt = min(
            remaining,
            key=lambda w: phonetic_distance(cur_ipa, ipas[w]) / max(len(cur_ipa), len(ipas[w]), 1)
        )
        path.append(nxt)
        remaining.remove(nxt)
    return path
# -------------------------
# API Endpoints
# -------------------------
@app.get("/")
def root():
    """Describe the API: name, version, and available endpoints."""
    endpoint_index = {
        "POST /sort": "Sort words by phonetic similarity",
        "POST /ipa": "Get IPA transcription for a single word",
        "GET /health": "Health check",
    }
    return {
        "name": "Phonetic Word Sorter API",
        "version": "1.0.0",
        "endpoints": endpoint_index,
    }
@app.get("/health")
def health_check():
    """Liveness probe: verify the espeak-ng binary is present and runnable.

    Never raises — any failure is reported in-band as an "unhealthy" status
    so the endpoint itself always answers 200.
    """
    try:
        # check=True makes a present-but-broken binary (non-zero exit) count
        # as unhealthy; previously only a missing or hanging binary was caught.
        subprocess.run(
            ["espeak-ng", "--version"],
            capture_output=True,
            timeout=2,
            check=True
        )
        return {"status": "healthy", "espeak_ng": "available"}
    except Exception as e:  # deliberate catch-all: a health check must not raise
        return {"status": "unhealthy", "error": str(e)}
@app.post("/ipa", response_model=IPAResponse)
def get_word_ipa(request: IPARequest):
    """Return the IPA transcription and phoneme tokens for a single word.

    Responds 400 when espeak-ng yields no transcription for the word.
    """
    transcription = get_ipa(request.word, request.lang)
    if not transcription:
        raise HTTPException(
            status_code=400,
            detail=f"Could not get IPA for word '{request.word}'"
        )
    return IPAResponse(
        word=request.word,
        ipa=transcription,
        tokens=ipa_tokenize(transcription),
    )
@app.post("/sort", response_model=SortResponse)
def sort_words(request: SortRequest):
    """Sort the words of *request.text* by phonetic similarity.

    Pipeline:
        1. Tokenize the text into lowercase words (ASCII punctuation stripped).
        2. Deduplicate, preserving first-seen order.
        3. Transcribe each word to a tuple of IPA phoneme tokens via espeak-ng.
        4. Order words with greedy nearest-neighbor seriation over phonetic
           edit distance.

    Raises:
        HTTPException: 400 when the text is empty, yields no words, or no word
            could be transcribed.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="No text provided")

    words = tokenize_text(request.text)
    if not words:
        raise HTTPException(status_code=400, detail="No valid words found in text")

    # dict.fromkeys deduplicates while preserving first-seen order —
    # replaces the hand-rolled seen-set loop.
    unique_words = list(dict.fromkeys(words))

    # Transcribe each word; words espeak-ng cannot handle are dropped up front
    # instead of being stored as empty tuples and filtered afterwards.
    ipas = {}
    for word in unique_words:
        ipa = get_ipa(word, request.lang)
        if ipa:
            ipas[word] = tuple(ipa_tokenize(ipa))

    valid_words = [w for w in unique_words if w in ipas]
    if not valid_words:
        raise HTTPException(
            status_code=400,
            detail="Could not get IPA transcription for any words"
        )

    # Seriate, then pair each word with its joined IPA string for the response.
    ordered = seriate(valid_words, ipas)
    return SortResponse(
        sorted_words=[WordIPA(word=w, ipa="".join(ipas[w])) for w in ordered],
    )
# Run a local development server when this file is executed directly
# (production deployments would typically invoke uvicorn externally).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)