diff --git a/pun-sort/default.nix b/pun-sort/default.nix index 14ba217..13d51f7 100644 --- a/pun-sort/default.nix +++ b/pun-sort/default.nix @@ -4,9 +4,11 @@ writers.writePython3Bin "pun_sort_api.py" { "E203" "E203" "E226" + "E265" "E302" "E305" "E501" + "F401" "F841" "W503" ]; diff --git a/pun-sort/sort_api.py b/pun-sort/sort_api.py index 281269d..3046bfc 100755 --- a/pun-sort/sort_api.py +++ b/pun-sort/sort_api.py @@ -1,11 +1,12 @@ +#!/usr/bin/env python3 """ FastAPI backend for phonetic word sorting Sorts words by their phonetic similarity using espeak-ng IPA transcription """ -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, Request from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, Field -from typing import List +from fastapi.responses import HTMLResponse +from typing import List, Dict, Any import string import subprocess from functools import lru_cache @@ -25,37 +26,6 @@ app.add_middleware( allow_headers=["*"], ) -# ------------------------- -# Models -# ------------------------- -class SortRequest(BaseModel): - text: str = Field(..., description="Text containing words to sort") - lang: str = Field("en", description="Language code for espeak-ng (e.g., 'en', 'de', 'es')") - - class Config: - schema_extra = { - "example": { - "text": "The quick brown fox jumps over the lazy dog", - "lang": "en" - } - } - -class WordIPA(BaseModel): - word: str - ipa: str - -class SortResponse(BaseModel): - sorted_words: List[WordIPA] - -class IPARequest(BaseModel): - word: str - lang: str = Field("en", description="Language code for espeak-ng") - -class IPAResponse(BaseModel): - word: str - ipa: str - tokens: List[str] - # ------------------------- # IPA helpers # ------------------------- @@ -76,7 +46,7 @@ def get_ipa(word: str, lang: str = "en") -> str: status_code=500, detail="espeak-ng not found. Please install it: apt-get install espeak-ng" ) - except Exception as e: + except Exception: return "" def ipa_tokenize(ipa: str) -> List[str]: @@ -85,11 +55,9 @@ def ipa_tokenize(ipa: str) -> List[str]: i = 0 while i < len(ipa): ch = ipa[i] - # Skip stress markers if ch in "ˈˌ": i += 1 continue - # Check for diphthongs if i + 1 < len(ipa) and ipa[i:i+2] in {"aɪ", "aʊ", "eɪ", "oʊ", "ɔɪ"}: tokens.append(ipa[i:i+2]) i += 2 @@ -139,10 +107,9 @@ def tokenize_text(text: str) -> List[str]: Tokenize text into words, removing punctuation. Handles Unicode letters (ä, ö, ü, ß, é, ñ, etc.) """ - # Remove punctuation and split into words cleaned = text.translate(str.maketrans('', '', string.punctuation)) tokens = cleaned.split() - return [word.lower() for word in tokens] + return tokens # ------------------------- # Seriation algorithm @@ -172,13 +139,198 @@ def seriate(words: List[str], ipas: dict) -> List[str]: # ------------------------- # API Endpoints # ------------------------- -@app.get("/") -def root(): - """Root endpoint with API information""" +@app.get("/", response_class=HTMLResponse) +async def root(): + """Serve HTML interface""" + return """ + + +
+ + +Sort words by their phonetic similarity using IPA transcription
+ + + + + + + + + + + + + + + """ + +@app.get("/api", response_class=HTMLResponse) +async def api_info(): + """API information endpoint""" return { "name": "Phonetic Word Sorter API", "version": "1.0.0", "endpoints": { + "GET /": "Web interface", "POST /sort": "Sort words by phonetic similarity", "POST /ipa": "Get IPA transcription for a single word", "GET /health": "Health check" @@ -186,10 +338,9 @@ def root(): } @app.get("/health") -def health_check(): +async def health_check(): """Health check endpoint""" try: - # Test espeak-ng availability subprocess.run( ["espeak-ng", "--version"], capture_output=True, @@ -199,48 +350,66 @@ def health_check(): except Exception as e: return {"status": "unhealthy", "error": str(e)} -@app.post("/ipa", response_model=IPAResponse) -def get_word_ipa(request: IPARequest): +@app.post("/ipa") +async def get_word_ipa(request: Request): """ Get IPA transcription and tokens for a single word + + Request body: + { + "word": "hello", + "lang": "en" + } """ - ipa = get_ipa(request.word, request.lang) + data = await request.json() + + word = data.get("word") + if not word: + raise HTTPException(status_code=400, detail="'word' field is required") + + lang = data.get("lang", "en") + + ipa = get_ipa(word, lang) if not ipa: raise HTTPException( status_code=400, - detail=f"Could not get IPA for word '{request.word}'" + detail=f"Could not get IPA for word '{word}'" ) tokens = ipa_tokenize(ipa) - return IPAResponse( - word=request.word, - ipa=ipa, - tokens=tokens - ) + return { + "word": word, + "ipa": ipa, + "tokens": tokens + } -@app.post("/sort", response_model=SortResponse) -def sort_words(request: SortRequest): +@app.post("/sort") +async def sort_words(request: Request): """ Sort words from text by phonetic similarity - The algorithm: - 1. Tokenizes input text into words - 2. Gets IPA transcription for each word - 3. Tokenizes IPA into phonemes - 4. Uses nearest-neighbor seriation to order words by phonetic similarity - 5. Returns ordered list with IPA transcriptions + Request body: + { + "text": "The quick brown fox jumps over the lazy dog", + "lang": "en" + } """ - if not request.text.strip(): - raise HTTPException(status_code=400, detail="No text provided") + data = await request.json() - # Tokenize text into words - words = tokenize_text(request.text) + text = data.get("text") + if not text or not text.strip(): + raise HTTPException(status_code=400, detail="'text' field is required") + + lang = data.get("lang", "en") + + words = tokenize_text(text) if not words: raise HTTPException(status_code=400, detail="No valid words found in text") - # Remove duplicates while preserving order + original_count = len(words) + seen = set() unique_words = [] for word in words: @@ -248,17 +417,14 @@ def sort_words(request: SortRequest): seen.add(word) unique_words.append(word) - # Get IPA for all words ipas = {} for word in unique_words: - ipa = get_ipa(word, request.lang) + ipa = get_ipa(word, lang) if ipa: ipas[word] = tuple(ipa_tokenize(ipa)) else: - # If IPA fails, use empty tuple ipas[word] = tuple() - # Filter out words with no IPA valid_words = [w for w in unique_words if ipas[w]] if not valid_words: @@ -267,20 +433,22 @@ def sort_words(request: SortRequest): detail="Could not get IPA transcription for any words" ) - # Sort by phonetic similarity ordered = seriate(valid_words, ipas) - # Build response sorted_words = [ - WordIPA(word=w, ipa="".join(ipas[w])) + {"word": w, "ipa": "".join(ipas[w])} for w in ordered ] - return SortResponse( - sorted_words=sorted_words, - ) + return { + "sorted_words": sorted_words, + "original_count": original_count, + "unique_count": len(unique_words) + } if __name__ == "__main__": import uvicorn import os - uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000))) + + port = int(os.environ.get("PORT", 8000)) + uvicorn.run(app, host="0.0.0.0", port=port)