pun-sort: add api

This commit is contained in:
2025-12-28 21:58:59 +01:00
parent dfd03dd376
commit 950805bc9d

286
pun-sort/sort_api.py Executable file
View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
FastAPI backend for phonetic word sorting
Sorts words by their phonetic similarity using espeak-ng IPA transcription
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import List, Optional
import string
import subprocess
from functools import lru_cache
# FastAPI application; the metadata below feeds the auto-generated OpenAPI docs.
app = FastAPI(
    title="Phonetic Word Sorter API",
    description="Sort words by phonetic similarity using IPA transcription",
    version="1.0.0"
)
# CORS middleware
# NOTE(review): browsers reject `Access-Control-Allow-Origin: *` when
# credentials are allowed — `allow_credentials=True` combined with
# `allow_origins=["*"]` likely does not do what was intended; confirm
# whether credentialed cross-origin requests are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# -------------------------
# Models
# -------------------------
class SortRequest(BaseModel):
    """Request body for POST /sort."""
    # Free text; it is tokenized into words server-side.
    text: str = Field(..., description="Text containing words to sort")
    lang: str = Field("en", description="Language code for espeak-ng (e.g., 'en', 'de', 'es')")

    class Config:
        # NOTE(review): `schema_extra` is the Pydantic v1 spelling; Pydantic v2
        # renamed it to `json_schema_extra` (via `model_config`) — confirm the
        # installed Pydantic version, otherwise this example is silently ignored.
        schema_extra = {
            "example": {
                "text": "The quick brown fox jumps over the lazy dog",
                "lang": "en"
            }
        }
class WordIPA(BaseModel):
    """A single word paired with its IPA transcription."""
    word: str
    ipa: str
class SortResponse(BaseModel):
    """Response body for POST /sort: words in phonetic-similarity order."""
    sorted_words: List[WordIPA]
class IPARequest(BaseModel):
    """Request body for POST /ipa: one word to transcribe."""
    word: str
    lang: str = Field("en", description="Language code for espeak-ng")
class IPAResponse(BaseModel):
    """Response body for POST /ipa: transcription plus its phoneme tokens."""
    word: str
    ipa: str
    tokens: List[str]
# -------------------------
# IPA helpers
# -------------------------
def get_ipa(word: str, lang: str = "en") -> str:
    """Return the IPA transcription of *word* using the espeak-ng CLI.

    Args:
        word: Single word to transcribe.
        lang: espeak-ng voice/language code (e.g. 'en', 'de', 'es').

    Returns:
        The IPA string with surrounding '/' delimiters stripped, or "" when
        espeak-ng cannot transcribe the word (best-effort contract; callers
        treat an empty string as "no transcription").

    Raises:
        HTTPException: 504 when espeak-ng hangs; 500 when it is not installed.
    """
    try:
        out = subprocess.check_output(
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", word],
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=5,
        )
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=504, detail="espeak-ng timeout")
    except FileNotFoundError:
        raise HTTPException(
            status_code=500,
            detail="espeak-ng not found. Please install it: apt-get install espeak-ng"
        )
    except (subprocess.CalledProcessError, OSError):
        # Narrowed from the original bare `except Exception` (which also had an
        # unused `e`): a non-zero exit (e.g. unknown voice) or OS-level failure
        # keeps the best-effort contract and signals failure via "".
        return ""
    return out.strip().strip("/")
# Diphthongs that must remain single tokens. The original set was garbled —
# two entries had degraded to empty strings; "aʊ" and "oʊ" restore the
# standard English diphthong inventory (aɪ, aʊ, eɪ, oʊ, ɔɪ).
_DIPHTHONGS = {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}


def ipa_tokenize(ipa: str) -> List[str]:
    """Tokenize an IPA string into a list of phoneme tokens.

    Stress markers (ˈ, ˌ) and the '_' phoneme separator that espeak-ng emits
    with --ipa=3 carry no segmental information and are dropped. Two-character
    diphthongs are kept as single tokens; every other character is one token.

    Args:
        ipa: Raw IPA string as produced by get_ipa().

    Returns:
        List of phoneme tokens (empty list for an empty input).
    """
    tokens: List[str] = []
    i = 0
    while i < len(ipa):
        ch = ipa[i]
        # Skip stress markers and espeak-ng's --ipa=3 phoneme separator.
        if ch in "ˈˌ_":
            i += 1
            continue
        # Out-of-range slices yield short strings, never IndexError,
        # so no explicit bounds check is needed before the 2-char probe.
        pair = ipa[i:i + 2]
        if pair in _DIPHTHONGS:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ch)
            i += 1
    return tokens
# -------------------------
# Distance calculation
# -------------------------
# Vowel inventory used to scale substitution penalties.
VOWELS = set("aeiouəɪʊɔɛɜɑæ")


def sub_cost(a: str, b: str) -> float:
    """Return the cost of substituting phoneme *a* for phoneme *b*.

    0.0 for identical phonemes; 0.6 for vowel↔vowel (perceptually close);
    2.0 for vowel↔consonant (perceptually far); 1.0 for consonant↔consonant.
    """
    if a == b:
        return 0.0
    a_is_vowel = a in VOWELS
    b_is_vowel = b in VOWELS
    if a_is_vowel and b_is_vowel:
        return 0.6  # vowel-for-vowel swaps sound similar
    if a_is_vowel != b_is_vowel:
        return 2.0  # crossing the vowel/consonant boundary is penalized hard
    return 1.0      # plain consonant-for-consonant substitution
# Bounded cache: the key space (arbitrary word pairs) grows without limit in a
# long-running server, so maxsize=None was a slow memory leak.
@lru_cache(maxsize=65536)
def phonetic_distance(a: tuple, b: tuple) -> float:
    """Weighted Levenshtein distance between two phoneme-token tuples.

    Insertions and deletions cost 1; substitutions cost sub_cost(x, y).
    Tuples (not lists) are required so the arguments are hashable for the cache.

    Args:
        a: Phoneme tokens of the first word.
        b: Phoneme tokens of the second word.

    Returns:
        Non-negative distance; 0.0 iff the sequences are identical.
    """
    n, m = len(a), len(b)
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    # Base cases: transforming to/from the empty sequence costs one per token.
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            dp[i][j] = min(
                dp[i - 1][j] + 1,                               # delete a[i-1]
                dp[i][j - 1] + 1,                               # insert b[j-1]
                dp[i - 1][j - 1] + sub_cost(a[i - 1], b[j - 1]) # substitute
            )
    return dp[n][m]
def tokenize_text(text: str) -> List[str]:
    """Split *text* into lowercase words with ASCII punctuation removed.

    Non-ASCII letters (ä, ö, ü, ß, é, ñ, ...) pass through untouched — only
    `string.punctuation` characters are stripped. Note the strip happens
    before splitting, so punctuation-joined words merge ("don't" -> "dont").
    """
    # One C-level translate pass removes every ASCII punctuation character.
    cleaned = text.translate(str.maketrans("", "", string.punctuation))
    return [token.lower() for token in cleaned.split()]
# -------------------------
# Seriation algorithm
# -------------------------
def seriate(words: List[str], ipas: dict) -> List[str]:
    """Order *words* so phonetically similar neighbors end up adjacent.

    Greedy nearest-neighbor chain: start from the first word and repeatedly
    append the remaining word with the smallest length-normalized phonetic
    distance to the current tail.

    Args:
        words: Unique words to order (the /sort endpoint dedupes before calling).
        ipas: Mapping of word -> tuple of IPA phoneme tokens.

    Returns:
        The words in seriation order; inputs of length 0 or 1 are returned as-is.
    """
    if len(words) <= 1:
        return words
    # Keep candidates in a list in input order rather than a set: min() breaks
    # ties by iteration order, and set iteration order varies with
    # PYTHONHASHSEED, which made tied-distance output nondeterministic.
    remaining = list(words[1:])
    path = [words[0]]
    while remaining:
        cur_ipa = ipas[path[-1]]  # hoisted out of the key lambda
        nxt = min(
            remaining,
            key=lambda w: phonetic_distance(cur_ipa, ipas[w]) / max(len(cur_ipa), len(ipas[w]), 1)
        )
        path.append(nxt)
        remaining.remove(nxt)
    return path
# -------------------------
# API Endpoints
# -------------------------
@app.get("/")
def root():
    """Describe the API: name, version, and available endpoints."""
    endpoint_index = {
        "POST /sort": "Sort words by phonetic similarity",
        "POST /ipa": "Get IPA transcription for a single word",
        "GET /health": "Health check",
    }
    return {
        "name": "Phonetic Word Sorter API",
        "version": "1.0.0",
        "endpoints": endpoint_index,
    }
@app.get("/health")
def health_check():
    """Liveness probe: verify the espeak-ng binary is present and runnable.

    Never raises — any failure is reported in-band as an "unhealthy" status
    so the endpoint itself always answers 200.
    """
    try:
        # check=True makes a present-but-broken binary (non-zero exit) count
        # as unhealthy; previously only a missing or hanging binary was caught.
        subprocess.run(
            ["espeak-ng", "--version"],
            capture_output=True,
            timeout=2,
            check=True
        )
        return {"status": "healthy", "espeak_ng": "available"}
    except Exception as e:  # deliberate catch-all: a health check must not raise
        return {"status": "unhealthy", "error": str(e)}
@app.post("/ipa", response_model=IPAResponse)
def get_word_ipa(request: IPARequest):
    """Return the IPA transcription and phoneme tokens for a single word.

    Responds 400 when espeak-ng yields no transcription for the word.
    """
    transcription = get_ipa(request.word, request.lang)
    if not transcription:
        raise HTTPException(
            status_code=400,
            detail=f"Could not get IPA for word '{request.word}'"
        )
    return IPAResponse(
        word=request.word,
        ipa=transcription,
        tokens=ipa_tokenize(transcription),
    )
@app.post("/sort", response_model=SortResponse)
def sort_words(request: SortRequest):
    """Sort the words of *request.text* by phonetic similarity.

    Pipeline:
        1. Tokenize the text into lowercase words (ASCII punctuation stripped).
        2. Deduplicate, preserving first-seen order.
        3. Transcribe each word to a tuple of IPA phoneme tokens via espeak-ng.
        4. Order words with greedy nearest-neighbor seriation over phonetic
           edit distance.

    Raises:
        HTTPException: 400 when the text is empty, yields no words, or no word
            could be transcribed.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="No text provided")

    words = tokenize_text(request.text)
    if not words:
        raise HTTPException(status_code=400, detail="No valid words found in text")

    # dict.fromkeys deduplicates while preserving first-seen order —
    # replaces the hand-rolled seen-set loop.
    unique_words = list(dict.fromkeys(words))

    # Transcribe each word; words espeak-ng cannot handle are dropped up front
    # instead of being stored as empty tuples and filtered afterwards.
    ipas = {}
    for word in unique_words:
        ipa = get_ipa(word, request.lang)
        if ipa:
            ipas[word] = tuple(ipa_tokenize(ipa))

    valid_words = [w for w in unique_words if w in ipas]
    if not valid_words:
        raise HTTPException(
            status_code=400,
            detail="Could not get IPA transcription for any words"
        )

    # Seriate, then pair each word with its joined IPA string for the response.
    ordered = seriate(valid_words, ipas)
    return SortResponse(
        sorted_words=[WordIPA(word=w, ipa="".join(ipas[w])) for w in ordered],
    )
# Run a local development server when this file is executed directly
# (production deployments would typically invoke uvicorn externally).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)