#!/usr/bin/env python3
"""
FastAPI backend for phonetic word sorting.

Sorts words by their phonetic similarity using espeak-ng IPA transcription.
"""

import logging
import string
import subprocess
import unicodedata
from functools import lru_cache
from typing import Any, Dict, List

from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse

logger = logging.getLogger(__name__)

app = FastAPI(
    title="Phonetic Word Sorter API",
    description="Sort words by phonetic similarity using IPA transcription",
    version="1.0.0"
)

# CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site make credentialed requests. Nothing in this file uses cookies or auth,
# so consider allow_credentials=False -- confirm against the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# -------------------------
# IPA helpers
# -------------------------

def get_ipa(word: str, lang: str = "en") -> str:
    """Get the IPA transcription of *word* using espeak-ng.

    Args:
        word: Text to transcribe.
        lang: espeak-ng voice name passed to ``-v``.

    Returns:
        The transcription with surrounding whitespace and ``/`` removed, or
        an empty string when espeak-ng fails for any other reason
        (best effort).

    Raises:
        HTTPException: 504 when espeak-ng does not finish within 5 seconds,
            500 when the espeak-ng executable is not installed.
    """
    try:
        out = subprocess.check_output(
            # "--" ends option parsing so that a word starting with "-"
            # is treated as text and never as an espeak-ng option.
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", "--", word],
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=5
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(
            status_code=504, detail="espeak-ng timeout"
        ) from exc
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=500,
            detail=(
                "espeak-ng not found. "
                "Please install it: apt-get install espeak-ng"
            )
        ) from exc
    except Exception:
        # Deliberate best effort: callers treat "" as "no transcription".
        logger.warning(
            "espeak-ng failed for word=%r lang=%r", word, lang, exc_info=True
        )
        return ""
    return out.strip().strip("/")


# Stress marks carry no segmental information; "_" is the phoneme separator
# that espeak-ng emits for --ipa=3. Neither is a phoneme.
_IGNORED_IPA_CHARS = frozenset("ˈˌ_")
_DIPHTHONGS = frozenset({"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"})


def ipa_tokenize(ipa: str) -> List[str]:
    """Tokenize an IPA string into phonemes.

    Stress marks and the ``_`` phoneme separator are dropped. The diphthongs
    in ``_DIPHTHONGS`` become a single token; every other character is its
    own token.
    """
    tokens = []
    i = 0
    length = len(ipa)
    while i < length:
        if ipa[i] in _IGNORED_IPA_CHARS:
            i += 1
            continue
        pair = ipa[i:i + 2]
        if pair in _DIPHTHONGS:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ipa[i])
            i += 1
    return tokens


# -------------------------
# Distance calculation
# -------------------------

VOWELS = set("aeiouəɪʊɔɛɜɑæ")


def sub_cost(a: str, b: str) -> float:
    """Return the substitution cost between two phoneme tokens.

    Identical tokens cost 0.0, vowel-for-vowel 0.6, vowel-for-non-vowel 2.0
    and any other substitution 1.0. Only single characters listed in
    ``VOWELS`` count as vowels, so diphthong tokens are treated as
    non-vowels.
    """
    if a == b:
        return 0.0
    a_is_vowel = a in VOWELS
    b_is_vowel = b in VOWELS
    if a_is_vowel and b_is_vowel:
        return 0.6
    if a_is_vowel or b_is_vowel:
        return 2.0
    return 1.0


# Bounded: the keys come from request data, so an unbounded cache would grow
# for as long as the process lives.
@lru_cache(maxsize=16384)
def phonetic_distance(a: tuple, b: tuple) -> float:
    """Return the weighted edit distance between two IPA token sequences.

    Insertions and deletions cost 1; substitutions cost ``sub_cost``.
    Both arguments must be hashable (tuples) because results are cached.
    """
    m = len(b)
    # Only the previous row of the DP table is needed at any time.
    prev = [float(j) for j in range(m + 1)]
    for i in range(1, len(a) + 1):
        cur = [float(i)] + [0.0] * m
        for j in range(1, m + 1):
            cur[j] = min(
                prev[j] + 1,
                cur[j - 1] + 1,
                prev[j - 1] + sub_cost(a[i - 1], b[j - 1])
            )
        prev = cur
    return prev[m]


_ASCII_PUNCTUATION = frozenset(string.punctuation)


def tokenize_text(text: str) -> List[str]:
    """
    Tokenize text into words, removing punctuation.

    Removes every character in ``string.punctuation`` as well as any
    character whose Unicode category is punctuation (curly quotes, dashes,
    ellipsis, ...), then splits on whitespace. Letters such as
    ä, ö, ü, ß, é, ñ are kept.
    """
    cleaned = "".join(
        ch for ch in text
        if ch not in _ASCII_PUNCTUATION
        and not unicodedata.category(ch).startswith("P")
    )
    return cleaned.split()


# -------------------------
# Seriation algorithm
# -------------------------

def seriate(words: List[str], ipas: dict) -> List[str]:
    """
    Sort words by phonetic similarity using nearest-neighbor seriation.

    Starts from ``words[0]`` and repeatedly appends the not-yet-used word
    whose length-normalized phonetic distance to the last word is smallest.
    Ties go to the word that appears first in ``words``, so the result is
    deterministic. Duplicate words are emitted once.

    Args:
        words: Words to order.
        ipas: Mapping from each word to its tuple of IPA tokens.

    Returns:
        A new list with the words in seriated order.
    """
    if len(words) <= 1:
        return list(words)

    def normalized_distance(left: tuple, right: tuple) -> float:
        # max(..., 1) guards against division by zero for empty sequences.
        longest = max(len(left), len(right), 1)
        return phonetic_distance(left, right) / longest

    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    remaining = list(dict.fromkeys(words))
    path = [remaining.pop(0)]

    while remaining:
        current = ipas[path[-1]]
        distances = [normalized_distance(current, ipas[w]) for w in remaining]
        # list.index returns the first minimum -> stable tie-breaking.
        nearest = distances.index(min(distances))
        path.append(remaining.pop(nearest))

    return path


# -------------------------
# Request helpers
# -------------------------

async def _read_json_object(request: Request) -> Dict[str, Any]:
    """Parse the request body as a JSON object or raise a 400 error."""
    try:
        data = await request.json()
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from exc
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object"
        )
    return data


def _read_lang(data: Dict[str, Any]) -> str:
    """Return the ``lang`` field (default ``"en"``) or raise a 400 error."""
    lang = data.get("lang", "en")
    if not isinstance(lang, str) or not lang:
        raise HTTPException(
            status_code=400, detail="'lang' must be a non-empty string"
        )
    return lang


def _sort_unique_words(
    unique_words: List[str], lang: str
) -> List[Dict[str, str]]:
    """Transcribe *unique_words*, seriate them and build the response rows.

    Blocking (spawns one espeak-ng process per word); call it from a worker
    thread, not from the event loop.
    """
    ipas = {
        word: tuple(ipa_tokenize(get_ipa(word, lang)))
        for word in unique_words
    }
    valid_words = [w for w in unique_words if ipas[w]]
    if not valid_words:
        raise HTTPException(
            status_code=400,
            detail="Could not get IPA transcription for any words"
        )
    ordered = seriate(valid_words, ipas)
    return [{"word": w, "ipa": "".join(ipas[w])} for w in ordered]


# -------------------------
# API Endpoints
# -------------------------

@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve HTML interface"""
    return """
Sort words by their phonetic similarity using IPA transcription
"""


# No response_class override: this endpoint returns a dict, which must be
# serialized as JSON (HTMLResponse cannot encode a dict).
@app.get("/api")
async def api_info():
    """API information endpoint"""
    return {
        "name": "Phonetic Word Sorter API",
        "version": "1.0.0",
        "endpoints": {
            "GET /": "Web interface",
            "POST /sort": "Sort words by phonetic similarity",
            "POST /ipa": "Get IPA transcription for a single word",
            "GET /health": "Health check"
        }
    }


@app.get("/health")
async def health_check():
    """Health check endpoint: reports whether espeak-ng can be executed."""
    try:
        # Run in a worker thread so the event loop is not blocked;
        # check=True makes a non-zero exit status count as unhealthy.
        await run_in_threadpool(
            subprocess.run,
            ["espeak-ng", "--version"],
            capture_output=True,
            timeout=2,
            check=True
        )
        return {"status": "healthy", "espeak_ng": "available"}
    except Exception as e:
        return {"status": "unhealthy", "error": str(e)}


@app.post("/ipa")
async def get_word_ipa(request: Request):
    """
    Get IPA transcription and tokens for a single word

    Request body:
    {
        "word": "hello",
        "lang": "en"
    }
    """
    data = await _read_json_object(request)

    word = data.get("word")
    if not word:
        raise HTTPException(status_code=400, detail="'word' field is required")
    if not isinstance(word, str):
        raise HTTPException(status_code=400, detail="'word' must be a string")

    lang = _read_lang(data)

    ipa = await run_in_threadpool(get_ipa, word, lang)
    if not ipa:
        raise HTTPException(
            status_code=400,
            detail=f"Could not get IPA for word '{word}'"
        )

    return {
        "word": word,
        "ipa": ipa,
        "tokens": ipa_tokenize(ipa)
    }


@app.post("/sort")
async def sort_words(request: Request):
    """
    Sort words from text by phonetic similarity

    Request body:
    {
        "text": "The quick brown fox jumps over the lazy dog",
        "lang": "en"
    }
    """
    data = await _read_json_object(request)

    text = data.get("text")
    if text is not None and not isinstance(text, str):
        raise HTTPException(status_code=400, detail="'text' must be a string")
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="'text' field is required")

    lang = _read_lang(data)

    words = tokenize_text(text)
    if not words:
        raise HTTPException(
            status_code=400, detail="No valid words found in text"
        )

    # De-duplicate while keeping first-occurrence order.
    unique_words = list(dict.fromkeys(words))

    sorted_words = await run_in_threadpool(
        _sort_unique_words, unique_words, lang
    )

    return {
        "sorted_words": sorted_words,
        "original_count": len(words),
        "unique_count": len(unique_words)
    }


if __name__ == "__main__":
    import uvicorn
    import os

    port = int(os.environ.get("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=port)