Files
to-hen/pun-sort/sort_api.py

455 lines
13 KiB
Python
Raw Normal View History

2025-12-28 22:31:09 +01:00
#!/usr/bin/env python3
2025-12-28 21:58:59 +01:00
"""
FastAPI backend for phonetic word sorting
Sorts words by their phonetic similarity using espeak-ng IPA transcription
"""
2025-12-28 22:31:09 +01:00
from fastapi import FastAPI, HTTPException, Request
2025-12-28 21:58:59 +01:00
from fastapi.middleware.cors import CORSMiddleware
2025-12-28 22:31:09 +01:00
from fastapi.responses import HTMLResponse
from typing import List, Dict, Any
2025-12-28 21:58:59 +01:00
import string
import subprocess
from functools import lru_cache
# Application object; the metadata below is what FastAPI is given for this API.
app = FastAPI(
    title="Phonetic Word Sorter API",
    description="Sort words by phonetic similarity using IPA transcription",
    version="1.0.0",
)

# CORS middleware: any origin, method and header may call the API.
# Credentials stay disabled: combining a wildcard origin with
# allow_credentials=True is the configuration the FastAPI CORS documentation
# warns against, and no endpoint visible in this file reads cookies or
# authentication headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# -------------------------
# IPA helpers
# -------------------------
def get_ipa(word: str, lang: str = "en") -> str:
    """Return the IPA transcription of *word* as printed by espeak-ng.

    Args:
        word: Text to transcribe.
        lang: Voice name handed to espeak-ng's ``-v`` option.

    Returns:
        The espeak-ng output with surrounding whitespace and ``/`` characters
        stripped, or an empty string when the call fails for any reason other
        than the two cases listed under ``Raises``.

    Raises:
        HTTPException: 504 when espeak-ng does not finish within 5 seconds;
            500 when the ``espeak-ng`` executable cannot be found.
    """
    try:
        out = subprocess.check_output(
            # "--" ends option parsing, so a word that begins with "-" is
            # transcribed as text instead of being read as a command switch.
            ["espeak-ng", "-v", lang, "-q", "--ipa=3", "--", word],
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=5,
        )
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=504, detail="espeak-ng timeout")
    except FileNotFoundError:
        raise HTTPException(
            status_code=500,
            detail="espeak-ng not found. Please install it: apt-get install espeak-ng"
        )
    except Exception:
        # Deliberate best effort: a non-zero exit status, undecodable output
        # or an unusable argument yields "" so callers can skip the word.
        return ""
    return out.strip().strip("/")
def ipa_tokenize(ipa: str) -> List[str]:
    """Split an IPA string into phoneme tokens.

    Primary (ˈ) and secondary (ˌ) stress marks are dropped. The diphthongs
    aɪ, aʊ, eɪ, oʊ and ɔɪ are kept together as a single token; every other
    character becomes its own token.

    Args:
        ipa: IPA transcription, possibly empty.

    Returns:
        The tokens in order of appearance (an empty list for empty input).
    """
    # The original set held two empty strings, which can never equal a
    # two-character slice; they are restored here as "aʊ" and "oʊ".
    diphthongs = {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}
    stress_marks = "ˈˌ"

    tokens = []
    i = 0
    while i < len(ipa):
        if ipa[i] in stress_marks:
            i += 1
            continue
        # At the last character the slice has length 1 and cannot match.
        pair = ipa[i:i + 2]
        if pair in diphthongs:
            tokens.append(pair)
            i += 2
        else:
            tokens.append(ipa[i])
            i += 1
    return tokens
# -------------------------
# Distance calculation
# -------------------------
# Single IPA characters that are treated as vowels by the cost function.
VOWELS = set("aeiouəɪʊɔɛɜɑæ")


def _is_vowel(token: str) -> bool:
    """Return True when every character of a non-empty *token* is a vowel."""
    return bool(token) and all(ch in VOWELS for ch in token)


def sub_cost(a: str, b: str) -> float:
    """Return the cost of substituting phoneme *a* with phoneme *b*.

    Costs:
        0.0  identical tokens
        0.6  vowel for vowel
        2.0  vowel for consonant (either direction)
        1.0  consonant for consonant

    Multi-character tokens such as the diphthong "aɪ" count as vowels when
    all of their characters are vowels. A plain ``token in VOWELS`` test
    would classify them as consonants because VOWELS only holds single
    characters.
    """
    if a == b:
        return 0.0
    a_vowel = _is_vowel(a)
    b_vowel = _is_vowel(b)
    if a_vowel and b_vowel:
        return 0.6
    if a_vowel or b_vowel:
        return 2.0
    return 1.0
# Bounded cache: keys are arbitrary token tuples, so an unbounded cache would
# grow for as long as the process keeps seeing new words.
@lru_cache(maxsize=65536)
def phonetic_distance(a: tuple, b: tuple) -> float:
    """Return the weighted edit distance between two phoneme sequences.

    Insertions and deletions cost 1; substitutions cost ``sub_cost(x, y)``.

    Args:
        a: First sequence of phoneme tokens (a tuple, so it can be cached).
        b: Second sequence of phoneme tokens.

    Returns:
        The minimum total cost of turning *a* into *b*, always as a float.
    """
    n, m = len(a), len(b)
    # Only the previous row of the DP table is needed to compute the next.
    prev = [float(j) for j in range(m + 1)]
    for i in range(1, n + 1):
        cur = [float(i)] + [0.0] * m
        for j in range(1, m + 1):
            cur[j] = min(
                prev[j] + 1,          # delete a[i - 1]
                cur[j - 1] + 1,       # insert b[j - 1]
                prev[j - 1] + sub_cost(a[i - 1], b[j - 1]),
            )
        prev = cur
    return prev[m]
def tokenize_text(text: str) -> List[str]:
    """Split *text* into whitespace-separated words with punctuation removed.

    Every character in ``string.punctuation`` is deleted, and so is every
    character whose Unicode general category is punctuation (for example
    „ “ – … ¿), so quoted or dash-separated non-English text is cleaned as
    well. Letters such as ä, ö, ü, ß, é and ñ are kept.

    Args:
        text: Arbitrary input text.

    Returns:
        The remaining words in their original order; an empty list when
        nothing but punctuation and whitespace was given.
    """
    # Imported locally so this function brings its own dependency.
    import unicodedata

    ascii_punctuation = set(string.punctuation)
    cleaned = "".join(
        ch
        for ch in text
        if ch not in ascii_punctuation
        and not unicodedata.category(ch).startswith("P")
    )
    return cleaned.split()
2025-12-28 21:58:59 +01:00
# -------------------------
# Seriation algorithm
# -------------------------
def seriate(words: List[str], ipas: dict) -> List[str]:
    """Order words so that each one is followed by its closest-sounding neighbour.

    Starting from ``words[0]``, the not-yet-used word with the smallest
    length-normalised phonetic distance to the current word is appended,
    until every word has been placed (greedy nearest-neighbour chaining).

    Args:
        words: Words to order; the first one is the starting point.
        ipas: Mapping from each word to its tuple of phoneme tokens.

    Returns:
        The words in chained order. Lists of length 0 or 1 are returned
        unchanged.
    """
    if len(words) <= 1:
        return words

    start = words[0]
    # An input-ordered list (rather than a set) makes ties resolve to the
    # earliest word in the input; set iteration order depends on string
    # hashing and can differ from one process to the next.
    remaining = [w for w in dict.fromkeys(words) if w != start]
    path = [start]

    while remaining:
        cur_ipa = ipas[path[-1]]
        nxt = min(
            remaining,
            key=lambda w: phonetic_distance(cur_ipa, ipas[w])
            / max(len(cur_ipa), len(ipas[w]), 1),
        )
        path.append(nxt)
        remaining.remove(nxt)
    return path
# -------------------------
# API Endpoints
# -------------------------
2025-12-28 22:31:09 +01:00
@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve the single-page HTML interface.

    The returned page contains a textarea, a language selector and a button.
    Its inline script POSTs ``{text, lang}`` as JSON to ``/sort`` and renders
    each returned word with its IPA string; Ctrl+Enter in the textarea
    triggers the same action as the button.
    """
    # NOTE(review): the inline script builds its output with innerHTML and
    # interpolates item.word, item.ipa and error.message without escaping.
    return """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Phonetic Word Sorter</title>
<style>
body {
font-family: Georgia, serif;
max-width: 650px;
margin: 40px auto;
padding: 0 20px;
line-height: 1.6;
color: #222;
}
h1 {
font-size: 1.8em;
margin-bottom: 0.3em;
font-weight: normal;
}
.subtitle {
color: #666;
margin-bottom: 2em;
font-style: italic;
}
label {
display: block;
margin-top: 1.5em;
margin-bottom: 0.3em;
}
textarea {
width: 100%;
padding: 8px;
border: 1px solid #ccc;
font-family: inherit;
font-size: 1em;
resize: vertical;
min-height: 100px;
}
select {
padding: 6px;
border: 1px solid #ccc;
font-family: inherit;
font-size: 1em;
}
button {
margin-top: 1em;
padding: 8px 16px;
border: 1px solid #333;
background: white;
cursor: pointer;
font-family: inherit;
font-size: 1em;
}
button:hover {
background: #f5f5f5;
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
#results {
margin-top: 2em;
padding-top: 2em;
border-top: 1px solid #ddd;
}
.result-header {
margin-bottom: 1em;
font-weight: normal;
}
.stats {
color: #666;
font-size: 0.9em;
margin-bottom: 1.5em;
}
.word-item {
padding: 0.5em 0;
border-bottom: 1px dotted #ddd;
}
.word {
font-weight: bold;
}
.ipa {
color: #666;
font-family: monospace;
margin-left: 1em;
}
.error {
color: #c00;
margin-top: 1em;
padding: 1em;
border-left: 3px solid #c00;
background: #fff5f5;
}
</style>
</head>
<body>
<h1>Phonetic Word Sorter</h1>
<p class="subtitle">Sort words by their phonetic similarity using IPA transcription</p>
<label for="text">Enter your text:</label>
<textarea id="text" placeholder="night knight kite kit bit bite byte">night knight kite kit bit bite byte</textarea>
<label for="lang">Language:</label>
<select id="lang">
<option value="en">English</option>
<option value="de">German</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
<option value="it">Italian</option>
<option value="pt">Portuguese</option>
<option value="nl">Dutch</option>
<option value="sv">Swedish</option>
<option value="no">Norwegian</option>
<option value="da">Danish</option>
</select>
<button id="sortBtn" onclick="sortWords()">Sort Words</button>
<div id="results"></div>
<script>
async function sortWords() {
const text = document.getElementById('text').value;
const lang = document.getElementById('lang').value;
const resultsDiv = document.getElementById('results');
const sortBtn = document.getElementById('sortBtn');
if (!text.trim()) {
resultsDiv.innerHTML = '<div class="error">Please enter some text</div>';
return;
}
sortBtn.disabled = true;
sortBtn.textContent = 'Sorting...';
resultsDiv.innerHTML = '<p>Processing...</p>';
try {
const response = await fetch('/sort', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ text, lang })
});
if (!response.ok) {
const error = await response.json();
throw new Error(error.detail || 'Request failed');
}
const data = await response.json();
let html = '<h2 class="result-header">Sorted Results</h2>';
html += `<div class="stats">${data.original_count} words (${data.unique_count} unique)</div>`;
data.sorted_words.forEach(item => {
html += `<div class="word-item"><span class="word">${item.word}</span><span class="ipa">/${item.ipa}/</span></div>`;
});
resultsDiv.innerHTML = html;
} catch (error) {
resultsDiv.innerHTML = `<div class="error">Error: ${error.message}</div>`;
} finally {
sortBtn.disabled = false;
sortBtn.textContent = 'Sort Words';
}
}
// Allow Enter key in textarea
document.getElementById('text').addEventListener('keydown', function(e) {
if (e.key === 'Enter' && e.ctrlKey) {
sortWords();
}
});
</script>
</body>
</html>
"""
# No response_class here: the handler returns a dict, which the default JSON
# response serialises. With HTMLResponse the dict would be handed to a
# renderer that expects str/bytes and the request would fail.
@app.get("/api")
async def api_info() -> Dict[str, Any]:
    """Return the API name, version and a summary of its endpoints as JSON."""
    return {
        "name": "Phonetic Word Sorter API",
        "version": "1.0.0",
        "endpoints": {
            "GET /": "Web interface",
            "POST /sort": "Sort words by phonetic similarity",
            "POST /ipa": "Get IPA transcription for a single word",
            "GET /health": "Health check"
        }
    }
@app.get("/health")
async def health_check() -> Dict[str, str]:
    """Report whether the espeak-ng executable can be run.

    Runs ``espeak-ng --version`` with a 2 second timeout.

    Returns:
        ``{"status": "healthy", "espeak_ng": "available"}`` when the command
        exits successfully, otherwise ``{"status": "unhealthy", "error": ...}``
        carrying the text of the exception that was raised.
    """
    try:
        subprocess.run(
            ["espeak-ng", "--version"],
            capture_output=True,
            timeout=2,
            # Without check=True a non-zero exit status would still be
            # reported as healthy.
            check=True,
        )
    except Exception as e:
        return {"status": "unhealthy", "error": str(e)}
    return {"status": "healthy", "espeak_ng": "available"}
2025-12-28 22:31:09 +01:00
@app.post("/ipa")
async def get_word_ipa(request: Request):
    """
    Get IPA transcription and tokens for a single word

    Request body:
    {
        "word": "hello",
        "lang": "en"
    }

    Returns a JSON object with the keys "word", "ipa" and "tokens".

    Raises:
        HTTPException: 400 when the body is not a JSON object, when "word"
            is missing, empty or not a string, when "lang" is not a
            non-empty string, or when no transcription could be produced.
            Errors raised by get_ipa propagate unchanged.
    """
    # Malformed JSON or undecodable bytes surface as ValueError subclasses;
    # report them as a client error instead of an unhandled server error.
    try:
        data = await request.json()
    except ValueError:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    word = data.get("word")
    if not isinstance(word, str) or not word:
        raise HTTPException(status_code=400, detail="'word' field is required")

    lang = data.get("lang", "en")
    if not isinstance(lang, str) or not lang:
        raise HTTPException(status_code=400, detail="'lang' must be a non-empty string")

    ipa = get_ipa(word, lang)
    if not ipa:
        raise HTTPException(
            status_code=400,
            detail=f"Could not get IPA for word '{word}'"
        )

    return {
        "word": word,
        "ipa": ipa,
        "tokens": ipa_tokenize(ipa)
    }
2025-12-28 21:58:59 +01:00
2025-12-28 22:31:09 +01:00
@app.post("/sort")
async def sort_words(request: Request):
    """
    Sort words from text by phonetic similarity

    Request body:
    {
        "text": "The quick brown fox jumps over the lazy dog",
        "lang": "en"
    }

    Returns a JSON object with:
        "sorted_words":   list of {"word", "ipa"} in seriated order
        "original_count": number of words found in the text
        "unique_count":   number of distinct words

    Raises:
        HTTPException: 400 when the body is not a JSON object, when "text"
            is missing, blank or not a string, when "lang" is not a
            non-empty string, when the text contains no words, or when no
            word could be transcribed. Errors raised by get_ipa propagate
            unchanged.
    """
    # Malformed JSON or undecodable bytes surface as ValueError subclasses;
    # report them as a client error instead of an unhandled server error.
    try:
        data = await request.json()
    except ValueError:
        raise HTTPException(status_code=400, detail="Request body must be valid JSON")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    text = data.get("text")
    if not isinstance(text, str) or not text.strip():
        raise HTTPException(status_code=400, detail="'text' field is required")

    lang = data.get("lang", "en")
    if not isinstance(lang, str) or not lang:
        raise HTTPException(status_code=400, detail="'lang' must be a non-empty string")

    words = tokenize_text(text)
    if not words:
        raise HTTPException(status_code=400, detail="No valid words found in text")
    original_count = len(words)

    # Drop repeated words while keeping first-occurrence order.
    unique_words = list(dict.fromkeys(words))

    # NOTE(review): get_ipa runs one blocking subprocess per word inside this
    # async handler; consider moving the lookups off the event loop.
    # A failed lookup returns "", which tokenizes to an empty tuple.
    ipas = {
        word: tuple(ipa_tokenize(get_ipa(word, lang)))
        for word in unique_words
    }

    valid_words = [w for w in unique_words if ipas[w]]
    if not valid_words:
        raise HTTPException(
            status_code=400,
            detail="Could not get IPA transcription for any words"
        )

    ordered = seriate(valid_words, ipas)
    sorted_words = [
        {"word": w, "ipa": "".join(ipas[w])}
        for w in ordered
    ]

    return {
        "sorted_words": sorted_words,
        "original_count": original_count,
        "unique_count": len(unique_words)
    }
2025-12-28 21:58:59 +01:00
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import os

    import uvicorn

    # The PORT environment variable overrides the default port 8000.
    listen_port = int(os.getenv("PORT", "8000"))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)