#!/bin/sh
#
# Crawls the alphabetical place directory of the site at $root and prints, one
# per line on stdout, the absolute URL of every link whose href contains
# "-gemeinde-" or "-ort-".
#
# Required tools: curl, htmlq, sed, sort, xargs, seq.

# Base URL of the site; every relative href found while crawling is appended
# to it.
root=https://www.orte-in-deutschland.de
#######################################
# Extract place links from an HTML document.
# Globals:   root (read) - base URL prepended to every extracted href
# Arguments: none
# Input:     HTML document on stdin
# Outputs:   one "$root/<href>" line on stdout for each <a> whose href
#            contains "-gemeinde-" or "-ort-"
# Requires:  htmlq
#######################################
extract_orte() {
  htmlq 'a[href*="-gemeinde-"], a[href*="-ort-"]' --attribute href \
    | while IFS= read -r href || [ -n "$href" ]; do
        # printf instead of sed "s#^#$root/#": the base URL is emitted
        # verbatim, so characters like '&', '\' or '#' in it are never
        # interpreted as sed replacement syntax.
        printf '%s/%s\n' "$root" "$href"
      done
}

# Crawl: top-level index -> sub-index pages -> listing pages (all result
# pages of each listing). Every listing page is piped through extract_orte,
# which prints the place URLs to stdout.
curl -sSL "$root/alphabetisches-ortsverzeichnis.html" \
  | htmlq '#suchindex li a[href^="orte-in"]' --attribute href \
  | while read -r index_slug; do
      curl -sSL "$root/$index_slug" \
        | htmlq '#subsuchindex li a[href^="orte-in"]' --attribute href \
        | while read -r slug; do
            # Fetch the first page once; it is used both for its own links
            # and to discover the pagination range. Skip this entry if the
            # download fails (curl -S already reports the error on stderr).
            first_page="$(curl -sSL "$root/$slug")" || continue

            # printf, not echo: under /bin/sh, echo may interpret backslash
            # sequences contained in the HTML and corrupt or truncate it.
            printf '%s\n' "$first_page" | extract_orte

            # Collect the page numbers from all "?seite=N" links, reduce them
            # to the smallest and the largest, and let seq expand that range.
            # seq's stderr is silenced on purpose: when the page has no
            # pagination links it may be invoked without operands.
            printf '%s\n' "$first_page" \
              | htmlq 'a[href*="?seite="]' --attribute href \
              | sed 's/.*seite=//' \
              | sort -un \
              | sed -n '1p;$p' \
              | xargs seq 2>/dev/null \
              | while read -r page; do
                  curl -sSL "$root/$slug?seite=$page" | extract_orte
                done
          done
    done