feat(orte): begin scraping

This commit is contained in:
2022-03-26 17:45:21 +01:00
parent d34a8161ba
commit 736f1f37a0
2 changed files with 33836 additions and 0 deletions

20
orte/list-orte.sh Executable file
View File

@@ -0,0 +1,20 @@
#!/bin/sh
# Walk the alphabetical place directory of orte-in-deutschland.de and print
# the hrefs of the place links found there. Needs curl and htmlq on PATH.

# Base URL of the site; the paths fetched below are appended to it.
root="https://www.orte-in-deutschland.de"
# Filter an HTML document read from stdin down to place links: prints the
# href attribute of every <a> whose href contains "-gemeinde-" or "-ort-".
extract_orte() {
  htmlq --attribute href \
    'a[href*="-gemeinde-"], a[href*="-ort-"]'
}
# Crawl the alphabetical place directory and print every place link found.
#
# Three levels are walked: the links in "#suchindex" on the top-level
# directory page, the links in "#subsuchindex" on each of those pages, and
# finally each linked result page together with the extra pages it links to
# via "?seite=". Place hrefs go to stdout, one per line.
#
# curl flags: -f makes an HTTP error produce no output instead of an error
# page that would be parsed like a real listing; -sS silences progress but
# still reports failures on stderr; -L follows redirects.
curl -fsSL "$root/alphabetisches-ortsverzeichnis.html" \
  | htmlq '#suchindex li a[href^="orte-in"]' --attribute href \
  | while read -r index_slug; do
      curl -fsSL "$root/$index_slug" \
        | htmlq '#subsuchindex li a[href^="orte-in"]' --attribute href \
        | while read -r slug; do
            # Fetch the first result page once: it supplies both place links
            # and the pagination links. Skip this entry if the fetch fails
            # (curl has already reported the error on stderr).
            first_page="$(curl -fsSL "$root/$slug")" || continue
            # printf rather than echo: under /bin/sh, echo may interpret
            # backslash sequences in the page (e.g. "\c" cuts the output
            # short), which would corrupt the HTML handed to htmlq.
            printf '%s\n' "$first_page" | extract_orte
            # Pagination hrefs are appended to the page URL as-is;
            # sort -u drops links that are repeated on the page.
            printf '%s\n' "$first_page" \
              | htmlq 'a[href*="?seite="]' --attribute href \
              | sort -u \
              | while read -r page; do
                  curl -fsSL "$root/$slug$page" | extract_orte
                done
          done
    done

33816
orte/orte.txt Normal file

File diff suppressed because it is too large Load Diff