#!/usr/bin/env python3
"""
FastAPI backend for phonetic word sorting.

Sorts words by their phonetic similarity using espeak-ng IPA transcription.
"""

import string
import subprocess
from functools import lru_cache
from typing import Dict, List, Tuple

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

app = FastAPI(
    title="Phonetic Word Sorter API",
    description="Sort words by phonetic similarity using IPA transcription",
    version="1.0.0",
)

# CORS middleware (wide open; tighten allow_origins for production)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# -------------------------
# Models
# -------------------------

class SortRequest(BaseModel):
    text: str = Field(..., description="Text containing words to sort")
    lang: str = Field("en", description="Language code for espeak-ng (e.g., 'en', 'de', 'es')")

    class Config:
        schema_extra = {
            "example": {
                "text": "The quick brown fox jumps over the lazy dog",
                "lang": "en",
            }
        }

class WordIPA(BaseModel):
    word: str
    ipa: str

class SortResponse(BaseModel):
    sorted_words: List[WordIPA]

class IPARequest(BaseModel):
    word: str
    lang: str = Field("en", description="Language code for espeak-ng")

class IPAResponse(BaseModel):
    word: str
    ipa: str
    tokens: List[str]

# -------------------------
# IPA helpers
# -------------------------

def get_ipa(word: str, lang: str = "en") -> str:
    """Get the IPA transcription of a word using espeak-ng.

    --ipa=3 separates phonemes with underscores; ipa_tokenize() strips them.
    """
    try:
        out = subprocess.check_output(
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", word],
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=5,
        )
        return out.strip().strip("/")
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=504, detail="espeak-ng timeout")
    except FileNotFoundError:
        raise HTTPException(
            status_code=500,
            detail="espeak-ng not found. Please install it: apt-get install espeak-ng",
        )
    except subprocess.CalledProcessError:
        # espeak-ng failed for this word; callers treat "" as "no IPA available"
        return ""

def ipa_tokenize(ipa: str) -> List[str]:
    """Tokenize an IPA string into phonemes."""
    tokens: List[str] = []
    i = 0
    while i < len(ipa):
        ch = ipa[i]
        # Skip stress markers and the '_' phoneme separators from --ipa=3
        if ch in "ˈˌ_":
            i += 1
            continue
        # Check for diphthongs (two-character phonemes)
        if i + 1 < len(ipa) and ipa[i : i + 2] in {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}:
            tokens.append(ipa[i : i + 2])
            i += 2
        else:
            tokens.append(ch)
            i += 1
    return tokens

# -------------------------
# Distance calculation
# -------------------------

VOWELS = set("aeiouəɪʊɔɛɜɑæ")

def sub_cost(a: str, b: str) -> float:
    """Substitution cost between two phonemes: vowel-for-vowel swaps are cheap,
    vowel-for-consonant swaps are expensive."""
    if a == b:
        return 0.0
    if a in VOWELS and b in VOWELS:
        return 0.6
    if a in VOWELS or b in VOWELS:
        return 2.0
    return 1.0

@lru_cache(maxsize=None)
def phonetic_distance(a: Tuple[str, ...], b: Tuple[str, ...]) -> float:
    """Weighted edit distance between two IPA token sequences."""
    n, m = len(a), len(b)
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            dp[i][j] = min(
                dp[i - 1][j] + 1,  # deletion
                dp[i][j - 1] + 1,  # insertion
                dp[i - 1][j - 1] + sub_cost(a[i - 1], b[j - 1]),  # substitution
            )
    return dp[n][m]
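# A worked example of the metric above (hand-built token tuples for
# illustration, not actual espeak-ng output): a vowel-for-vowel swap costs
# 0.6 while a consonant-for-consonant swap costs 1.0, so "cat" sorts
# nearer to "kit" than to "mat".
#
#   phonetic_distance(("k", "æ", "t"), ("k", "ɪ", "t"))  # -> 0.6 (æ -> ɪ)
#   phonetic_distance(("k", "æ", "t"), ("m", "æ", "t"))  # -> 1.0 (k -> m)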
""" # Remove punctuation and split into words cleaned = text.translate(str.maketrans('', '', string.punctuation)) tokens = cleaned.split() return [word.lower() for word in tokens] # ------------------------- # Seriation algorithm # ------------------------- def seriate(words: List[str], ipas: dict) -> List[str]: """ Sort words by phonetic similarity using nearest-neighbor seriation """ if len(words) <= 1: return words unused = set(words) path = [words[0]] unused.remove(words[0]) while unused: cur = path[-1] nxt = min( unused, key=lambda w: phonetic_distance(ipas[cur], ipas[w]) / max(len(ipas[cur]), len(ipas[w]), 1) ) path.append(nxt) unused.remove(nxt) return path # ------------------------- # API Endpoints # ------------------------- @app.get("/") def root(): """Root endpoint with API information""" return { "name": "Phonetic Word Sorter API", "version": "1.0.0", "endpoints": { "POST /sort": "Sort words by phonetic similarity", "POST /ipa": "Get IPA transcription for a single word", "GET /health": "Health check" } } @app.get("/health") def health_check(): """Health check endpoint""" try: # Test espeak-ng availability subprocess.run( ["espeak-ng", "--version"], capture_output=True, timeout=2 ) return {"status": "healthy", "espeak_ng": "available"} except Exception as e: return {"status": "unhealthy", "error": str(e)} @app.post("/ipa", response_model=IPAResponse) def get_word_ipa(request: IPARequest): """ Get IPA transcription and tokens for a single word """ ipa = get_ipa(request.word, request.lang) if not ipa: raise HTTPException( status_code=400, detail=f"Could not get IPA for word '{request.word}'" ) tokens = ipa_tokenize(ipa) return IPAResponse( word=request.word, ipa=ipa, tokens=tokens ) @app.post("/sort", response_model=SortResponse) def sort_words(request: SortRequest): """ Sort words from text by phonetic similarity The algorithm: 1. Tokenizes input text into words 2. Gets IPA transcription for each word 3. Tokenizes IPA into phonemes 4. Uses nearest-neighbor seriation to order words by phonetic similarity 5. Returns ordered list with IPA transcriptions """ if not request.text.strip(): raise HTTPException(status_code=400, detail="No text provided") # Tokenize text into words words = tokenize_text(request.text) if not words: raise HTTPException(status_code=400, detail="No valid words found in text") # Remove duplicates while preserving order seen = set() unique_words = [] for word in words: if word not in seen: seen.add(word) unique_words.append(word) # Get IPA for all words ipas = {} for word in unique_words: ipa = get_ipa(word, request.lang) if ipa: ipas[word] = tuple(ipa_tokenize(ipa)) else: # If IPA fails, use empty tuple ipas[word] = tuple() # Filter out words with no IPA valid_words = [w for w in unique_words if ipas[w]] if not valid_words: raise HTTPException( status_code=400, detail="Could not get IPA transcription for any words" ) # Sort by phonetic similarity ordered = seriate(valid_words, ipas) # Build response sorted_words = [ WordIPA(word=w, ipa="".join(ipas[w])) for w in ordered ] return SortResponse( sorted_words=sorted_words, ) if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=8000)