2025-12-28 21:58:59 +01:00
|
|
|
|
"""
|
|
|
|
|
|
FastAPI backend for phonetic word sorting
|
|
|
|
|
|
Sorts words by their phonetic similarity using espeak-ng IPA transcription
|
|
|
|
|
|
"""
|
|
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
|
|
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
|
|
from pydantic import BaseModel, Field
|
2025-12-28 22:12:51 +01:00
|
|
|
|
from typing import List
|
2025-12-28 21:58:59 +01:00
|
|
|
|
import string
|
|
|
|
|
|
import subprocess
|
|
|
|
|
|
from functools import lru_cache
|
|
|
|
|
|
|
|
|
|
|
|
# Application object; the metadata below is shown in the generated API docs.
app = FastAPI(
    title="Phonetic Word Sorter API",
    description="Sort words by phonetic similarity using IPA transcription",
    version="1.0.0",
)

# CORS middleware
# The API is intentionally open to every origin, method and header.
# Credentials are disabled: the FastAPI/Starlette CORS documentation states
# that wildcard origins/methods/headers must not be combined with
# allow_credentials=True. Nothing in this service reads cookies or
# Authorization headers, so anonymous cross-origin access is sufficient.
# NOTE(review): if a browser client really needs credentialed requests, list
# its origins explicitly here instead of re-enabling credentials with "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Models
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
class SortRequest(BaseModel):
    # Request body accepted by POST /sort.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays exactly as it was.)

    # Raw text; it is split into words by the /sort handler.
    text: str = Field(..., description="Text containing words to sort")
    # Voice/language code forwarded to espeak-ng; English when omitted.
    lang: str = Field(
        default="en",
        description="Language code for espeak-ng (e.g., 'en', 'de', 'es')",
    )

    class Config:
        # Example payload attached to the model schema.
        # NOTE(review): `schema_extra` is the pydantic v1 spelling; pydantic v2
        # calls it `json_schema_extra` - confirm against the installed version.
        schema_extra = {
            "example": {
                "text": "The quick brown fox jumps over the lazy dog",
                "lang": "en",
            },
        }
|
|
|
|
|
|
|
|
|
|
|
|
class WordIPA(BaseModel):
    # One entry of the /sort result: a word paired with its IPA string.

    # The word as it appears after tokenization.
    word: str
    # IPA transcription associated with `word`.
    ipa: str
|
|
|
|
|
|
|
|
|
|
|
|
class SortResponse(BaseModel):
    # Response body of POST /sort.

    # Words in their phonetically seriated order, each with its IPA.
    sorted_words: List[WordIPA]
|
|
|
|
|
|
|
|
|
|
|
|
class IPARequest(BaseModel):
    # Request body accepted by POST /ipa.

    # The single word to transcribe.
    word: str
    # Voice/language code forwarded to espeak-ng; English when omitted.
    lang: str = Field(default="en", description="Language code for espeak-ng")
|
|
|
|
|
|
|
|
|
|
|
|
class IPAResponse(BaseModel):
    # Response body of POST /ipa.

    # The word exactly as it was submitted.
    word: str
    # Transcription string returned by the IPA helper.
    ipa: str
    # The transcription split into phoneme tokens.
    tokens: List[str]
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# IPA helpers
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
def get_ipa(word: str, lang: str = "en") -> str:
    """Get IPA transcription using espeak-ng.

    Args:
        word: Text to transcribe.
        lang: Voice name handed to espeak-ng's ``-v`` option.

    Returns:
        The espeak-ng output with surrounding whitespace and ``/`` removed.
        An empty string is returned when ``word`` is blank or when espeak-ng
        fails for any reason other than the two cases listed under *Raises*
        (for example a non-zero exit status); callers treat ``""`` as
        "no transcription available".

    Raises:
        HTTPException: 504 when espeak-ng does not finish within 5 seconds,
            500 when the ``espeak-ng`` executable cannot be found.
    """
    if not word or not word.strip():
        # Nothing to transcribe - do not spawn a process for blank input.
        return ""

    # "--" ends option parsing so that user-supplied text starting with "-"
    # is always treated as text to speak, never as an espeak-ng option.
    command = ["espeak-ng", "-v", lang, "-q", "--ipa=3", "--", word]

    try:
        out = subprocess.check_output(
            command,
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=5,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(status_code=504, detail="espeak-ng timeout") from exc
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=500,
            detail="espeak-ng not found. Please install it: apt-get install espeak-ng",
        ) from exc
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best effort: a failed run (bad voice, non-zero exit, undecodable
        # output, embedded NUL in the argument, ...) means "no IPA".
        return ""

    return out.strip().strip("/")
|
|
|
|
|
|
|
|
|
|
|
|
# Two-character vowel glides that are kept together as a single token.
_DIPHTHONGS = frozenset({"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"})
# Characters that only glue the letters of one phoneme together
# (combining tie bar U+0361, zero-width joiner U+200D); removed up front.
_IPA_JOINERS = frozenset("\u0361\u200d")
# Characters that sit between phonemes and carry no segment of their own:
# primary/secondary stress marks and the "_" phoneme separator.
_IPA_BOUNDARIES = frozenset("ˈˌ_")


def ipa_tokenize(ipa: str) -> List[str]:
    """Tokenize IPA string into phonemes.

    Stress marks, ``_`` separators and whitespace are skipped and never
    produce a token. Tie/joiner characters are removed before scanning, so a
    diphthong is recognised whether or not its two letters are tied together.
    NOTE(review): which decoration ``--ipa=3`` emits (``_`` or a joiner)
    appears to depend on the espeak/espeak-ng version - both are handled.

    Args:
        ipa: Transcription string, e.g. the output of espeak-ng.

    Returns:
        Phoneme tokens in order. Each token is one character, except for the
        diphthongs "aɪ", "aʊ", "eɪ", "oʊ" and "ɔɪ", which are kept as
        two-character tokens.
    """
    text = "".join(ch for ch in ipa if ch not in _IPA_JOINERS)

    tokens: List[str] = []
    pos = 0
    end = len(text)
    while pos < end:
        ch = text[pos]
        # Boundaries are dropped; because they are consumed here, a diphthong
        # is never formed across a stress mark or an explicit separator.
        if ch in _IPA_BOUNDARIES or ch.isspace():
            pos += 1
            continue
        pair = text[pos:pos + 2]
        if pair in _DIPHTHONGS:
            tokens.append(pair)
            pos += 2
        else:
            tokens.append(ch)
            pos += 1
    return tokens
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Distance calculation
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Single-character IPA vowel symbols. The first group is the original
# inventory; the second adds further IPA vowels (e.g. ʌ, ɒ, ɐ, ɚ, ɝ and the
# front rounded vowels y, ø, œ) that were previously costed as consonants.
VOWELS = set(
    "aeiouəɪʊɔɛɜɑæ"
    "ʌɒɐɚɝᵻyøœɨʉɯɤɵ"
)


def sub_cost(a: str, b: str) -> float:
    """Calculate substitution cost between two phonemes.

    A phoneme counts as a vowel when its first character is in ``VOWELS``.
    Looking at the first character (instead of testing the whole token) makes
    two-character diphthong tokens such as "aɪ" count as vowels too.

    Args:
        a: First phoneme token.
        b: Second phoneme token.

    Returns:
        0.0 for identical tokens, 0.6 for vowel/vowel, 2.0 when exactly one
        of the two is a vowel, and 1.0 for consonant/consonant.
    """
    if a == b:
        return 0.0

    a_is_vowel = a[:1] in VOWELS
    b_is_vowel = b[:1] in VOWELS

    if a_is_vowel and b_is_vowel:
        return 0.6
    if a_is_vowel or b_is_vowel:
        return 2.0
    return 1.0
|
|
|
|
|
|
|
|
|
|
|
|
# The cache is bounded: the keys come from user-supplied text, so an
# unbounded cache would grow for the lifetime of the server process.
@lru_cache(maxsize=65536)
def phonetic_distance(a: tuple, b: tuple) -> float:
    """Calculate phonetic edit distance between two IPA token sequences.

    This is a weighted Levenshtein distance: insertions and deletions cost 1,
    substitutions cost ``sub_cost(x, y)``.

    Args:
        a: First phoneme sequence. Must be hashable (a tuple) because results
            are memoised with ``lru_cache``.
        b: Second phoneme sequence, same requirements as ``a``.

    Returns:
        The distance, always as a ``float`` (also when one side is empty).
    """
    m = len(b)

    # Only the previous row of the DP table is needed to compute the next.
    # Row 0: turning an empty prefix of `a` into b[:j] takes j insertions.
    prev = [float(j) for j in range(m + 1)]

    for i, tok_a in enumerate(a, start=1):
        # Column 0: turning a[:i] into an empty sequence takes i deletions.
        cur = [float(i)]
        for j, tok_b in enumerate(b, start=1):
            cur.append(
                min(
                    prev[j] + 1.0,                          # delete tok_a
                    cur[j - 1] + 1.0,                       # insert tok_b
                    prev[j - 1] + sub_cost(tok_a, tok_b),   # substitute
                )
            )
        prev = cur

    return prev[m]
|
|
|
|
|
|
|
|
|
|
|
|
def tokenize_text(text: str) -> List[str]:
    """
    Tokenize text into lower-cased words, removing punctuation.

    Unicode letters (ä, ö, ü, ß, é, ñ, etc.) are kept. Punctuation is removed
    whether it is ASCII (everything in ``string.punctuation``) or Unicode
    (any character whose general category starts with "P", e.g. ¿ ¡ „ “ … —).
    Punctuation is deleted, not replaced by a space, so "well-known" becomes
    "wellknown" and "don't" becomes "dont".

    Args:
        text: Arbitrary input text.

    Returns:
        The whitespace-separated words of the cleaned text, lower-cased, in
        their original order. Empty when no word characters remain.
    """
    # Imported locally so this function does not depend on a new file-level
    # import.
    import unicodedata

    # string.punctuation also covers ASCII symbols ($, +, <, ...) which are
    # not in a "P" category; keep stripping those as before.
    ascii_marks = set(string.punctuation)

    cleaned = "".join(
        ch
        for ch in text
        if ch not in ascii_marks
        and not unicodedata.category(ch).startswith("P")
    )
    return [word.lower() for word in cleaned.split()]
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# Seriation algorithm
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
def seriate(words: List[str], ipas: dict) -> List[str]:
    """
    Sort words by phonetic similarity using nearest-neighbor seriation.

    Starting from the first word, the path is extended greedily with the
    remaining word whose length-normalised phonetic distance to the current
    end of the path is smallest. Ties are broken by input order, so the
    result is deterministic for a given input.

    Args:
        words: Words to order. Duplicates are collapsed to their first
            occurrence.
        ipas: Maps each word to its tuple of phoneme tokens.

    Returns:
        A new list containing every distinct word exactly once.

    Raises:
        KeyError: if a word has no entry in ``ipas``.
    """
    # An order-preserving list (not a set) keeps min() tie-breaking
    # independent of string hash order.
    remaining = list(dict.fromkeys(words))
    if len(remaining) <= 1:
        return remaining

    path = [remaining.pop(0)]
    while remaining:
        cur_ipa = ipas[path[-1]]
        # Normalise by the longer sequence so long words are not penalised;
        # the trailing 1 guards against division by zero for empty sequences.
        nearest = min(
            remaining,
            key=lambda w: phonetic_distance(cur_ipa, ipas[w])
            / max(len(cur_ipa), len(ipas[w]), 1),
        )
        path.append(nearest)
        remaining.remove(nearest)

    return path
|
|
|
|
|
|
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
# API Endpoints
|
|
|
|
|
|
# -------------------------
|
|
|
|
|
|
@app.get("/")
def root():
    """Root endpoint with API information"""
    # Human-readable index of the routes this service exposes.
    endpoint_summary = {
        "POST /sort": "Sort words by phonetic similarity",
        "POST /ipa": "Get IPA transcription for a single word",
        "GET /health": "Health check",
    }
    return {
        "name": "Phonetic Word Sorter API",
        "version": "1.0.0",
        "endpoints": endpoint_summary,
    }
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/health")
def health_check():
    """Health check endpoint"""
    # Probe espeak-ng by asking for its version. check=True turns a non-zero
    # exit status into CalledProcessError, so a broken installation is
    # reported as unhealthy instead of passing silently.
    try:
        subprocess.run(
            ["espeak-ng", "--version"],
            capture_output=True,
            timeout=2,
            check=True,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: executable missing or not runnable.
        # SubprocessError: timeout or non-zero exit status.
        return {"status": "unhealthy", "error": str(exc)}

    return {"status": "healthy", "espeak_ng": "available"}
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/ipa", response_model=IPAResponse)
def get_word_ipa(request: IPARequest):
    """
    Get IPA transcription and tokens for a single word
    """
    transcription = get_ipa(request.word, request.lang)

    # An empty transcription means the helper could not produce any IPA.
    if not transcription:
        raise HTTPException(
            status_code=400,
            detail=f"Could not get IPA for word '{request.word}'",
        )

    phonemes = ipa_tokenize(transcription)
    return IPAResponse(word=request.word, ipa=transcription, tokens=phonemes)
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/sort", response_model=SortResponse)
def sort_words(request: SortRequest):
    """
    Sort words from text by phonetic similarity

    The algorithm:
    1. Tokenizes input text into words
    2. Gets IPA transcription for each word
    3. Tokenizes IPA into phonemes
    4. Uses nearest-neighbor seriation to order words by phonetic similarity
    5. Returns ordered list with IPA transcriptions
    """
    # Guard: reject blank input before doing any work.
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="No text provided")

    words = tokenize_text(request.text)
    if not words:
        raise HTTPException(status_code=400, detail="No valid words found in text")

    # dict keys keep first-seen order, which drops repeats while preserving
    # the order in which words appeared.
    unique_words = list(dict.fromkeys(words))

    # Phoneme tuple per word; an empty tuple marks "no transcription".
    ipas = {}
    for candidate in unique_words:
        transcription = get_ipa(candidate, request.lang)
        ipas[candidate] = tuple(ipa_tokenize(transcription)) if transcription else tuple()

    # Only words that actually produced phonemes take part in the ordering.
    valid_words = [candidate for candidate in unique_words if ipas[candidate]]
    if not valid_words:
        raise HTTPException(
            status_code=400,
            detail="Could not get IPA transcription for any words",
        )

    ordered = seriate(valid_words, ipas)

    # The returned IPA string is rebuilt from the phoneme tokens.
    return SortResponse(
        sorted_words=[
            WordIPA(word=entry, ipa="".join(ipas[entry]))
            for entry in ordered
        ],
    )
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import os

    import uvicorn

    # The PORT environment variable overrides the default port 8000.
    listen_port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|