pun-sort: add frontend
This commit is contained in:
@@ -4,9 +4,11 @@ writers.writePython3Bin "pun_sort_api.py" {
|
|||||||
"E203"
|
"E203"
|
||||||
"E203"
|
"E203"
|
||||||
"E226"
|
"E226"
|
||||||
|
"E265"
|
||||||
"E302"
|
"E302"
|
||||||
"E305"
|
"E305"
|
||||||
"E501"
|
"E501"
|
||||||
|
"F401"
|
||||||
"F841"
|
"F841"
|
||||||
"W503"
|
"W503"
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
FastAPI backend for phonetic word sorting
|
FastAPI backend for phonetic word sorting
|
||||||
Sorts words by their phonetic similarity using espeak-ng IPA transcription
|
Sorts words by their phonetic similarity using espeak-ng IPA transcription
|
||||||
"""
|
"""
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException, Request
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from pydantic import BaseModel, Field
|
from fastapi.responses import HTMLResponse
|
||||||
from typing import List
|
from typing import List, Dict, Any
|
||||||
import string
|
import string
|
||||||
import subprocess
|
import subprocess
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
@@ -25,37 +26,6 @@ app.add_middleware(
|
|||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# -------------------------
|
|
||||||
# Models
|
|
||||||
# -------------------------
|
|
||||||
class SortRequest(BaseModel):
|
|
||||||
text: str = Field(..., description="Text containing words to sort")
|
|
||||||
lang: str = Field("en", description="Language code for espeak-ng (e.g., 'en', 'de', 'es')")
|
|
||||||
|
|
||||||
class Config:
|
|
||||||
schema_extra = {
|
|
||||||
"example": {
|
|
||||||
"text": "The quick brown fox jumps over the lazy dog",
|
|
||||||
"lang": "en"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class WordIPA(BaseModel):
|
|
||||||
word: str
|
|
||||||
ipa: str
|
|
||||||
|
|
||||||
class SortResponse(BaseModel):
|
|
||||||
sorted_words: List[WordIPA]
|
|
||||||
|
|
||||||
class IPARequest(BaseModel):
|
|
||||||
word: str
|
|
||||||
lang: str = Field("en", description="Language code for espeak-ng")
|
|
||||||
|
|
||||||
class IPAResponse(BaseModel):
|
|
||||||
word: str
|
|
||||||
ipa: str
|
|
||||||
tokens: List[str]
|
|
||||||
|
|
||||||
# -------------------------
|
# -------------------------
|
||||||
# IPA helpers
|
# IPA helpers
|
||||||
# -------------------------
|
# -------------------------
|
||||||
@@ -76,7 +46,7 @@ def get_ipa(word: str, lang: str = "en") -> str:
|
|||||||
status_code=500,
|
status_code=500,
|
||||||
detail="espeak-ng not found. Please install it: apt-get install espeak-ng"
|
detail="espeak-ng not found. Please install it: apt-get install espeak-ng"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def ipa_tokenize(ipa: str) -> List[str]:
|
def ipa_tokenize(ipa: str) -> List[str]:
|
||||||
@@ -85,11 +55,9 @@ def ipa_tokenize(ipa: str) -> List[str]:
|
|||||||
i = 0
|
i = 0
|
||||||
while i < len(ipa):
|
while i < len(ipa):
|
||||||
ch = ipa[i]
|
ch = ipa[i]
|
||||||
# Skip stress markers
|
|
||||||
if ch in "ˈˌ":
|
if ch in "ˈˌ":
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
# Check for diphthongs
|
|
||||||
if i + 1 < len(ipa) and ipa[i:i+2] in {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}:
|
if i + 1 < len(ipa) and ipa[i:i+2] in {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}:
|
||||||
tokens.append(ipa[i:i+2])
|
tokens.append(ipa[i:i+2])
|
||||||
i += 2
|
i += 2
|
||||||
@@ -139,10 +107,9 @@ def tokenize_text(text: str) -> List[str]:
|
|||||||
Tokenize text into words, removing punctuation.
|
Tokenize text into words, removing punctuation.
|
||||||
Handles Unicode letters (ä, ö, ü, ß, é, ñ, etc.)
|
Handles Unicode letters (ä, ö, ü, ß, é, ñ, etc.)
|
||||||
"""
|
"""
|
||||||
# Remove punctuation and split into words
|
|
||||||
cleaned = text.translate(str.maketrans('', '', string.punctuation))
|
cleaned = text.translate(str.maketrans('', '', string.punctuation))
|
||||||
tokens = cleaned.split()
|
tokens = cleaned.split()
|
||||||
return [word.lower() for word in tokens]
|
return tokens
|
||||||
|
|
||||||
# -------------------------
|
# -------------------------
|
||||||
# Seriation algorithm
|
# Seriation algorithm
|
||||||
@@ -172,13 +139,198 @@ def seriate(words: List[str], ipas: dict) -> List[str]:
|
|||||||
# -------------------------
|
# -------------------------
|
||||||
# API Endpoints
|
# API Endpoints
|
||||||
# -------------------------
|
# -------------------------
|
||||||
@app.get("/")
|
@app.get("/", response_class=HTMLResponse)
|
||||||
def root():
|
async def root():
|
||||||
"""Root endpoint with API information"""
|
"""Serve HTML interface"""
|
||||||
|
return """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Phonetic Word Sorter</title>
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
font-family: Georgia, serif;
|
||||||
|
max-width: 650px;
|
||||||
|
margin: 40px auto;
|
||||||
|
padding: 0 20px;
|
||||||
|
line-height: 1.6;
|
||||||
|
color: #222;
|
||||||
|
}
|
||||||
|
h1 {
|
||||||
|
font-size: 1.8em;
|
||||||
|
margin-bottom: 0.3em;
|
||||||
|
font-weight: normal;
|
||||||
|
}
|
||||||
|
.subtitle {
|
||||||
|
color: #666;
|
||||||
|
margin-bottom: 2em;
|
||||||
|
font-style: italic;
|
||||||
|
}
|
||||||
|
label {
|
||||||
|
display: block;
|
||||||
|
margin-top: 1.5em;
|
||||||
|
margin-bottom: 0.3em;
|
||||||
|
}
|
||||||
|
textarea {
|
||||||
|
width: 100%;
|
||||||
|
padding: 8px;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
font-family: inherit;
|
||||||
|
font-size: 1em;
|
||||||
|
resize: vertical;
|
||||||
|
min-height: 100px;
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
padding: 6px;
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
font-family: inherit;
|
||||||
|
font-size: 1em;
|
||||||
|
}
|
||||||
|
button {
|
||||||
|
margin-top: 1em;
|
||||||
|
padding: 8px 16px;
|
||||||
|
border: 1px solid #333;
|
||||||
|
background: white;
|
||||||
|
cursor: pointer;
|
||||||
|
font-family: inherit;
|
||||||
|
font-size: 1em;
|
||||||
|
}
|
||||||
|
button:hover {
|
||||||
|
background: #f5f5f5;
|
||||||
|
}
|
||||||
|
button:disabled {
|
||||||
|
opacity: 0.5;
|
||||||
|
cursor: not-allowed;
|
||||||
|
}
|
||||||
|
#results {
|
||||||
|
margin-top: 2em;
|
||||||
|
padding-top: 2em;
|
||||||
|
border-top: 1px solid #ddd;
|
||||||
|
}
|
||||||
|
.result-header {
|
||||||
|
margin-bottom: 1em;
|
||||||
|
font-weight: normal;
|
||||||
|
}
|
||||||
|
.stats {
|
||||||
|
color: #666;
|
||||||
|
font-size: 0.9em;
|
||||||
|
margin-bottom: 1.5em;
|
||||||
|
}
|
||||||
|
.word-item {
|
||||||
|
padding: 0.5em 0;
|
||||||
|
border-bottom: 1px dotted #ddd;
|
||||||
|
}
|
||||||
|
.word {
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
.ipa {
|
||||||
|
color: #666;
|
||||||
|
font-family: monospace;
|
||||||
|
margin-left: 1em;
|
||||||
|
}
|
||||||
|
.error {
|
||||||
|
color: #c00;
|
||||||
|
margin-top: 1em;
|
||||||
|
padding: 1em;
|
||||||
|
border-left: 3px solid #c00;
|
||||||
|
background: #fff5f5;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Phonetic Word Sorter</h1>
|
||||||
|
<p class="subtitle">Sort words by their phonetic similarity using IPA transcription</p>
|
||||||
|
|
||||||
|
<label for="text">Enter your text:</label>
|
||||||
|
<textarea id="text" placeholder="night knight kite kit bit bite byte">night knight kite kit bit bite byte</textarea>
|
||||||
|
|
||||||
|
<label for="lang">Language:</label>
|
||||||
|
<select id="lang">
|
||||||
|
<option value="en">English</option>
|
||||||
|
<option value="de">German</option>
|
||||||
|
<option value="es">Spanish</option>
|
||||||
|
<option value="fr">French</option>
|
||||||
|
<option value="it">Italian</option>
|
||||||
|
<option value="pt">Portuguese</option>
|
||||||
|
<option value="nl">Dutch</option>
|
||||||
|
<option value="sv">Swedish</option>
|
||||||
|
<option value="no">Norwegian</option>
|
||||||
|
<option value="da">Danish</option>
|
||||||
|
</select>
|
||||||
|
|
||||||
|
<button id="sortBtn" onclick="sortWords()">Sort Words</button>
|
||||||
|
|
||||||
|
<div id="results"></div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
async function sortWords() {
|
||||||
|
const text = document.getElementById('text').value;
|
||||||
|
const lang = document.getElementById('lang').value;
|
||||||
|
const resultsDiv = document.getElementById('results');
|
||||||
|
const sortBtn = document.getElementById('sortBtn');
|
||||||
|
|
||||||
|
if (!text.trim()) {
|
||||||
|
resultsDiv.innerHTML = '<div class="error">Please enter some text</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
sortBtn.disabled = true;
|
||||||
|
sortBtn.textContent = 'Sorting...';
|
||||||
|
resultsDiv.innerHTML = '<p>Processing...</p>';
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch('/sort', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
body: JSON.stringify({ text, lang })
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
const error = await response.json();
|
||||||
|
throw new Error(error.detail || 'Request failed');
|
||||||
|
}
|
||||||
|
|
||||||
|
const data = await response.json();
|
||||||
|
|
||||||
|
let html = '<h2 class="result-header">Sorted Results</h2>';
|
||||||
|
html += `<div class="stats">${data.original_count} words (${data.unique_count} unique)</div>`;
|
||||||
|
|
||||||
|
data.sorted_words.forEach(item => {
|
||||||
|
html += `<div class="word-item"><span class="word">${item.word}</span><span class="ipa">/${item.ipa}/</span></div>`;
|
||||||
|
});
|
||||||
|
|
||||||
|
resultsDiv.innerHTML = html;
|
||||||
|
} catch (error) {
|
||||||
|
resultsDiv.innerHTML = `<div class="error">Error: ${error.message}</div>`;
|
||||||
|
} finally {
|
||||||
|
sortBtn.disabled = false;
|
||||||
|
sortBtn.textContent = 'Sort Words';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allow Enter key in textarea
|
||||||
|
document.getElementById('text').addEventListener('keydown', function(e) {
|
||||||
|
if (e.key === 'Enter' && e.ctrlKey) {
|
||||||
|
sortWords();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
@app.get("/api", response_class=HTMLResponse)
|
||||||
|
async def api_info():
|
||||||
|
"""API information endpoint"""
|
||||||
return {
|
return {
|
||||||
"name": "Phonetic Word Sorter API",
|
"name": "Phonetic Word Sorter API",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"endpoints": {
|
"endpoints": {
|
||||||
|
"GET /": "Web interface",
|
||||||
"POST /sort": "Sort words by phonetic similarity",
|
"POST /sort": "Sort words by phonetic similarity",
|
||||||
"POST /ipa": "Get IPA transcription for a single word",
|
"POST /ipa": "Get IPA transcription for a single word",
|
||||||
"GET /health": "Health check"
|
"GET /health": "Health check"
|
||||||
@@ -186,10 +338,9 @@ def root():
|
|||||||
}
|
}
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health_check():
|
async def health_check():
|
||||||
"""Health check endpoint"""
|
"""Health check endpoint"""
|
||||||
try:
|
try:
|
||||||
# Test espeak-ng availability
|
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
["espeak-ng", "--version"],
|
["espeak-ng", "--version"],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
@@ -199,48 +350,66 @@ def health_check():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "unhealthy", "error": str(e)}
|
return {"status": "unhealthy", "error": str(e)}
|
||||||
|
|
||||||
@app.post("/ipa", response_model=IPAResponse)
|
@app.post("/ipa")
|
||||||
def get_word_ipa(request: IPARequest):
|
async def get_word_ipa(request: Request):
|
||||||
"""
|
"""
|
||||||
Get IPA transcription and tokens for a single word
|
Get IPA transcription and tokens for a single word
|
||||||
|
|
||||||
|
Request body:
|
||||||
|
{
|
||||||
|
"word": "hello",
|
||||||
|
"lang": "en"
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
ipa = get_ipa(request.word, request.lang)
|
data = await request.json()
|
||||||
|
|
||||||
|
word = data.get("word")
|
||||||
|
if not word:
|
||||||
|
raise HTTPException(status_code=400, detail="'word' field is required")
|
||||||
|
|
||||||
|
lang = data.get("lang", "en")
|
||||||
|
|
||||||
|
ipa = get_ipa(word, lang)
|
||||||
if not ipa:
|
if not ipa:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
detail=f"Could not get IPA for word '{request.word}'"
|
detail=f"Could not get IPA for word '{word}'"
|
||||||
)
|
)
|
||||||
|
|
||||||
tokens = ipa_tokenize(ipa)
|
tokens = ipa_tokenize(ipa)
|
||||||
|
|
||||||
return IPAResponse(
|
return {
|
||||||
word=request.word,
|
"word": word,
|
||||||
ipa=ipa,
|
"ipa": ipa,
|
||||||
tokens=tokens
|
"tokens": tokens
|
||||||
)
|
}
|
||||||
|
|
||||||
@app.post("/sort", response_model=SortResponse)
|
@app.post("/sort")
|
||||||
def sort_words(request: SortRequest):
|
async def sort_words(request: Request):
|
||||||
"""
|
"""
|
||||||
Sort words from text by phonetic similarity
|
Sort words from text by phonetic similarity
|
||||||
|
|
||||||
The algorithm:
|
Request body:
|
||||||
1. Tokenizes input text into words
|
{
|
||||||
2. Gets IPA transcription for each word
|
"text": "The quick brown fox jumps over the lazy dog",
|
||||||
3. Tokenizes IPA into phonemes
|
"lang": "en"
|
||||||
4. Uses nearest-neighbor seriation to order words by phonetic similarity
|
}
|
||||||
5. Returns ordered list with IPA transcriptions
|
|
||||||
"""
|
"""
|
||||||
if not request.text.strip():
|
data = await request.json()
|
||||||
raise HTTPException(status_code=400, detail="No text provided")
|
|
||||||
|
|
||||||
# Tokenize text into words
|
text = data.get("text")
|
||||||
words = tokenize_text(request.text)
|
if not text or not text.strip():
|
||||||
|
raise HTTPException(status_code=400, detail="'text' field is required")
|
||||||
|
|
||||||
|
lang = data.get("lang", "en")
|
||||||
|
|
||||||
|
words = tokenize_text(text)
|
||||||
|
|
||||||
if not words:
|
if not words:
|
||||||
raise HTTPException(status_code=400, detail="No valid words found in text")
|
raise HTTPException(status_code=400, detail="No valid words found in text")
|
||||||
|
|
||||||
# Remove duplicates while preserving order
|
original_count = len(words)
|
||||||
|
|
||||||
seen = set()
|
seen = set()
|
||||||
unique_words = []
|
unique_words = []
|
||||||
for word in words:
|
for word in words:
|
||||||
@@ -248,17 +417,14 @@ def sort_words(request: SortRequest):
|
|||||||
seen.add(word)
|
seen.add(word)
|
||||||
unique_words.append(word)
|
unique_words.append(word)
|
||||||
|
|
||||||
# Get IPA for all words
|
|
||||||
ipas = {}
|
ipas = {}
|
||||||
for word in unique_words:
|
for word in unique_words:
|
||||||
ipa = get_ipa(word, request.lang)
|
ipa = get_ipa(word, lang)
|
||||||
if ipa:
|
if ipa:
|
||||||
ipas[word] = tuple(ipa_tokenize(ipa))
|
ipas[word] = tuple(ipa_tokenize(ipa))
|
||||||
else:
|
else:
|
||||||
# If IPA fails, use empty tuple
|
|
||||||
ipas[word] = tuple()
|
ipas[word] = tuple()
|
||||||
|
|
||||||
# Filter out words with no IPA
|
|
||||||
valid_words = [w for w in unique_words if ipas[w]]
|
valid_words = [w for w in unique_words if ipas[w]]
|
||||||
|
|
||||||
if not valid_words:
|
if not valid_words:
|
||||||
@@ -267,20 +433,22 @@ def sort_words(request: SortRequest):
|
|||||||
detail="Could not get IPA transcription for any words"
|
detail="Could not get IPA transcription for any words"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Sort by phonetic similarity
|
|
||||||
ordered = seriate(valid_words, ipas)
|
ordered = seriate(valid_words, ipas)
|
||||||
|
|
||||||
# Build response
|
|
||||||
sorted_words = [
|
sorted_words = [
|
||||||
WordIPA(word=w, ipa="".join(ipas[w]))
|
{"word": w, "ipa": "".join(ipas[w])}
|
||||||
for w in ordered
|
for w in ordered
|
||||||
]
|
]
|
||||||
|
|
||||||
return SortResponse(
|
return {
|
||||||
sorted_words=sorted_words,
|
"sorted_words": sorted_words,
|
||||||
)
|
"original_count": original_count,
|
||||||
|
"unique_count": len(unique_words)
|
||||||
|
}
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
import os
|
import os
|
||||||
uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000)))
|
|
||||||
|
port = int(os.environ.get("PORT", 8000))
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=port)
|
||||||
|
|||||||
Reference in New Issue
Block a user