From df015db743e1ae6ce40ac58a79767926feaa54bd Mon Sep 17 00:00:00 2001 From: kmein Date: Tue, 22 May 2018 10:57:44 +0200 Subject: [PATCH 1/6] Add program and runner script --- epub.css | 3 +++ kevin.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kevin.sh | 2 ++ 3 files changed, 81 insertions(+) create mode 100644 epub.css create mode 100755 kevin.py create mode 100755 kevin.sh diff --git a/epub.css b/epub.css new file mode 100644 index 0000000..cb7e08e --- /dev/null +++ b/epub.css @@ -0,0 +1,3 @@ +body{margin:40px auto;max-width:650px;line-height:1.6;font-size:18px;color:#444;padding:0} +a{color:inherit;text-decoration:none} +a:hover{text-decoration:underline} diff --git a/kevin.py b/kevin.py new file mode 100755 index 0000000..f8d3836 --- /dev/null +++ b/kevin.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +from argparse import ArgumentParser +from bs4 import BeautifulSoup +from datetime import datetime +from typing import List +import re +import requests + + +def soup_from(url): + return BeautifulSoup(requests.get(url).text, "lxml") + + +class Author: + def __init__(self, author_id: int) -> None: + author_texts_url = "https://www.keinverlag.de/autorentexte.php?start=0&limit=1000000&sortby=tnr&autor={}".format(author_id) + soup = soup_from(author_texts_url) + self.texts = [] # type: List[Text] + for text in soup.select("ul.textliste > li > a[href$=\".text\"]"): + # strip off the last five characters (".text") + text_id = int(text["href"][:-5]) + try: + self.texts.append(Text(text_id)) + except ValueError: + continue + + def markdown(self, *, with_type: bool = False) -> str: + name = self.texts[0].author + + def __gen(): + yield "% {}".format(name) + for text in self.texts: + yield "\n\n* * *\n\n" + yield text.markdown(with_author=False, with_type=with_type) + + return "\n".join(__gen()) + + +class Text: + def __init__(self, text_id: int) -> None: + normalization = {132: "\"", 147: "\"", 0x96: "--", 0x91: "'", 0x92: "'", 0x97: "---"} + text_url = "https://www.keinverlag.de/{}.text".format(text_id) + soup = soup_from(text_url) + try: + self.title = soup.select("h1 > span")[0].text.translate(normalization) + content = str(soup.select(".fliesstext > span")[0]) + content = re.sub(r'(([\n\r]|.)*?)', r"_\1_", content) + self.content = BeautifulSoup(content, "lxml").text.translate(normalization) + self.author = soup.select("h3 > a")[2].text + self.type = soup.select("h1 ~ h3")[0].text + except IndexError: + raise ValueError("Text {} not available.".format(text_id)) + + def markdown(self, *, with_author: bool = True, with_type: bool = False) -> str: + return "#### {maybe_author}{title}{maybe_type}\n\n{content}".format( + title=self.title, + maybe_author=self.author + ": " if with_author else "", + maybe_type=" ("+self.type+")" if with_type else "", + content="\n".join(line + "\\" for line in self.content.splitlines())) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--type", help="Include text type", action="store_true") + subparsers = parser.add_subparsers() + + handle_text = subparsers.add_parser("text", help="Handle one text") + handle_text.add_argument("tid", help="KeinVerlag text id", type=int) + handle_text.set_defaults(func=lambda a: print(Text(a.tid).markdown(with_type=a.type))) + + handle_author = subparsers.add_parser("author", help="Handle all texts by an author") + handle_author.add_argument("aid", help="KeinVerlag author id", type=str) + handle_author.set_defaults(func=lambda a: print(Author(a.aid).markdown(with_type=a.type))) + + args = parser.parse_args() + args.func(args) diff --git a/kevin.sh b/kevin.sh new file mode 100755 index 0000000..474f091 --- /dev/null +++ b/kevin.sh @@ -0,0 +1,2 @@ +#!/bin/sh +python3 kevin.py author "$1" | pandoc -f markdown+smart --table-of-contents --toc-depth=6 --standalone --css=epub.css -o "$2" From 2dcf6007075ed14f03c38c7ec6676616db781d90 Mon Sep 17 00:00:00 2001 From: kmein Date: Tue, 22 May 2018 12:07:45 +0200 Subject: [PATCH 2/6] Small fixes - Remove unused import ~ Reformat text content normalization + Keep empty lines empty instead of replacing them with "\" --- kevin.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kevin.py b/kevin.py index f8d3836..9fac409 100755 --- a/kevin.py +++ b/kevin.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 from argparse import ArgumentParser from bs4 import BeautifulSoup -from datetime import datetime from typing import List import re import requests @@ -43,9 +42,11 @@ class Text: soup = soup_from(text_url) try: self.title = soup.select("h1 > span")[0].text.translate(normalization) - content = str(soup.select(".fliesstext > span")[0]) - content = re.sub(r'(([\n\r]|.)*?)', r"_\1_", content) - self.content = BeautifulSoup(content, "lxml").text.translate(normalization) + self.content = BeautifulSoup(re.sub( + r'(([\n\r]|.)*?)', + r"_\1_", + str(soup.select(".fliesstext > span")[0]) + ), "lxml").text.translate(normalization) self.author = soup.select("h3 > a")[2].text self.type = soup.select("h1 ~ h3")[0].text except IndexError: @@ -56,7 +57,7 @@ class Text: title=self.title, maybe_author=self.author + ": " if with_author else "", maybe_type=" ("+self.type+")" if with_type else "", - content="\n".join(line + "\\" for line in self.content.splitlines())) + content="\n".join(line + "\\" if line else line for line in self.content.splitlines())) if __name__ == "__main__": From a5426f8e51d8f06ef7b446a53f39343ab60e7145 Mon Sep 17 00:00:00 2001 From: kmein Date: Tue, 22 May 2018 12:10:30 +0200 Subject: [PATCH 3/6] "Explicit is better than implicit." ~ Markdown generation for empty lines --- kevin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kevin.py b/kevin.py index 9fac409..8b1d9ad 100755 --- a/kevin.py +++ b/kevin.py @@ -57,7 +57,7 @@ class Text: title=self.title, maybe_author=self.author + ": " if with_author else "", maybe_type=" ("+self.type+")" if with_type else "", - content="\n".join(line + "\\" if line else line for line in self.content.splitlines())) + content="\n".join(line + "\\" if line else "" for line in self.content.splitlines())) if __name__ == "__main__": From bed2a8a40d4377585a68bbcf5fc20c76e290149b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 1 Oct 2019 19:22:59 +0200 Subject: [PATCH 4/6] feat: add shell version --- .gitignore | 3 +++ keinverlag | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ kevin.py | 54 +++++++++++++++++++++++++++++++++++--------------- shell.nix | 10 ++++++++++ 4 files changed, 109 insertions(+), 16 deletions(-) create mode 100644 .gitignore create mode 100755 keinverlag create mode 100644 shell.nix diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..806fbfb --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.direnv +.envrc +.history diff --git a/keinverlag b/keinverlag new file mode 100755 index 0000000..90b1972 --- /dev/null +++ b/keinverlag @@ -0,0 +1,58 @@ +#!/bin/sh + +kv_GET () { + route=$1 + shift + curl -s "https://www.keinverlag.de$route" "$@" | iconv -f latin1 -t utf8 +} + +kv_author_id () { + if [ $# -ne 1 ]; then + echo Please call kv_author_id with an author name. >/dev/stderr + exit 1 + fi + + author_name=$1 + + kv_GET "/$author_name.kv" \ + | sed -n 's/.*autor=\([0-9]\+\).*/\1/p' \ + | head -1 +} + +kv_text () { + if [ $# -ne 1 ]; then + echo Please call kv_text with a text ID. >/dev/stderr + exit 1 + fi + + text_id=$1 + + kv_GET "/$text_id.text" \ + | sed -n '/

/,//p' \ + | pandoc -f html -t plain +} + +kv_author_texts () { + if [ $# -ne 1 ]; then + echo Please call kv_author_texts with an author ID. >/dev/stderr + exit 1 + fi + + author_id=$1 + + kv_GET '/autorentexte.php' -d sortby=datum -d start=0 -d limit=10000 -d autor="$author_id" \ + | sed -n 's/.*.*/\1/p' +} + +case $1 in + text) + shift + kv_text "$@";; + author) + shift + for text_id in $(kv_author_texts "$(kv_author_id "$@")"); do + kv_text "$text_id" + done ;; + *) + echo >/dev/stderr "Usage: $0 text|author ID" +esac diff --git a/kevin.py b/kevin.py index 8b1d9ad..9b4cbe1 100755 --- a/kevin.py +++ b/kevin.py @@ -6,16 +6,19 @@ import re import requests -def soup_from(url): - return BeautifulSoup(requests.get(url).text, "lxml") +def soup_from(response): + return BeautifulSoup(response.text, "lxml") class Author: def __init__(self, author_id: int) -> None: - author_texts_url = "https://www.keinverlag.de/autorentexte.php?start=0&limit=1000000&sortby=tnr&autor={}".format(author_id) - soup = soup_from(author_texts_url) + response = requests.get( + "https://www.keinverlag.de/autorentexte.php", + params={"start": 0, "limit": 10000, "sortby": "tnr", "author": author_id}, + ) + soup = soup_from(response) self.texts = [] # type: List[Text] - for text in soup.select("ul.textliste > li > a[href$=\".text\"]"): + for text in soup.select('ul.textliste > li > a[href$=".text"]'): # strip off the last five characters (".text") text_id = int(text["href"][:-5]) try: @@ -37,16 +40,26 @@ class Author: class Text: def __init__(self, text_id: int) -> None: - normalization = {132: "\"", 147: "\"", 0x96: "--", 0x91: "'", 0x92: "'", 0x97: "---"} + normalization = { + 132: '"', + 147: '"', + 0x96: "--", + 0x91: "'", + 0x92: "'", + 0x97: "---", + } text_url = "https://www.keinverlag.de/{}.text".format(text_id) soup = soup_from(text_url) try: self.title = soup.select("h1 > span")[0].text.translate(normalization) - self.content = BeautifulSoup(re.sub( - r'(([\n\r]|.)*?)', - r"_\1_", - str(soup.select(".fliesstext > span")[0]) - ), "lxml").text.translate(normalization) + self.content = BeautifulSoup( + re.sub( + r'(([\n\r]|.)*?)', + r"_\1_", + str(soup.select(".fliesstext > span")[0]), + ), + "lxml", + ).text.translate(normalization) self.author = soup.select("h3 > a")[2].text self.type = soup.select("h1 ~ h3")[0].text except IndexError: @@ -56,8 +69,11 @@ class Text: return "#### {maybe_author}{title}{maybe_type}\n\n{content}".format( title=self.title, maybe_author=self.author + ": " if with_author else "", - maybe_type=" ("+self.type+")" if with_type else "", - content="\n".join(line + "\\" if line else "" for line in self.content.splitlines())) + maybe_type=" (" + self.type + ")" if with_type else "", + content="\n".join( + line + "\\" if line else "" for line in self.content.splitlines() + ), + ) if __name__ == "__main__": @@ -67,11 +83,17 @@ if __name__ == "__main__": handle_text = subparsers.add_parser("text", help="Handle one text") handle_text.add_argument("tid", help="KeinVerlag text id", type=int) - handle_text.set_defaults(func=lambda a: print(Text(a.tid).markdown(with_type=a.type))) + handle_text.set_defaults( + func=lambda a: print(Text(a.tid).markdown(with_type=a.type)) + ) - handle_author = subparsers.add_parser("author", help="Handle all texts by an author") + handle_author = subparsers.add_parser( + "author", help="Handle all texts by an author" + ) handle_author.add_argument("aid", help="KeinVerlag author id", type=str) - handle_author.set_defaults(func=lambda a: print(Author(a.aid).markdown(with_type=a.type))) + handle_author.set_defaults( + func=lambda a: print(Author(a.aid).markdown(with_type=a.type)) + ) args = parser.parse_args() args.func(args) diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..59c35ed --- /dev/null +++ b/shell.nix @@ -0,0 +1,10 @@ +{ pkgs ? import {} }: +pkgs.mkShell { + buildInputs = with pkgs; [ + pandoc + python3Packages.beautifulsoup4 + python3Packages.requests + python3Packages.lxml + ]; + shellHook = "export HISTFILE=${toString ./.history}"; +} From 9d755bb4202c7032442c6bdc81483656b34af4c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 1 Oct 2019 21:02:16 +0200 Subject: [PATCH 5/6] feat: cut out h3s so only the text is left --- keinverlag | 1 + 1 file changed, 1 insertion(+) diff --git a/keinverlag b/keinverlag index 90b1972..5efe721 100755 --- a/keinverlag +++ b/keinverlag @@ -29,6 +29,7 @@ kv_text () { kv_GET "/$text_id.text" \ | sed -n '/

/,//p' \ + | sed 's/

.\+<\/h3>//g' \ | pandoc -f html -t plain } From e8409354e4ba381744e2ef98c952402901f3fb92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kier=C3=A1n=20Meinhardt?= Date: Tue, 1 Oct 2019 21:34:50 +0200 Subject: [PATCH 6/6] fix: use aliases to correctly and uniquely get all texts --- keinverlag | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/keinverlag b/keinverlag index 5efe721..cde6f39 100755 --- a/keinverlag +++ b/keinverlag @@ -1,10 +1,9 @@ #!/bin/sh -kv_GET () { - route=$1 - shift - curl -s "https://www.keinverlag.de$route" "$@" | iconv -f latin1 -t utf8 -} +alias to_utf8='iconv -f latin1 -t utf8' +alias curl_GET='curl -X GET -s -G' + +BASE_URL=https://www.keinverlag.de kv_author_id () { if [ $# -ne 1 ]; then @@ -14,7 +13,8 @@ kv_author_id () { author_name=$1 - kv_GET "/$author_name.kv" \ + curl_GET "$BASE_URL/$author_name.kv" \ + | to_utf8 \ | sed -n 's/.*autor=\([0-9]\+\).*/\1/p' \ | head -1 } @@ -27,7 +27,8 @@ kv_text () { text_id=$1 - kv_GET "/$text_id.text" \ + curl_GET "$BASE_URL/$text_id.text" \ + | to_utf8 \ | sed -n '/

/,//p' \ | sed 's/

.\+<\/h3>//g' \ | pandoc -f html -t plain @@ -41,8 +42,9 @@ kv_author_texts () { author_id=$1 - kv_GET '/autorentexte.php' -d sortby=datum -d start=0 -d limit=10000 -d autor="$author_id" \ - | sed -n 's/.*.*/\1/p' + curl_GET "$BASE_URL/autorentexte.php" -d sortby=tnr -d start=0 -d limit=10000 -d autor="$author_id" \ + | to_utf8 \ + | sed -n 's/.*
  • .*/\1/p' } case $1 in @@ -51,7 +53,7 @@ case $1 in kv_text "$@";; author) shift - for text_id in $(kv_author_texts "$(kv_author_id "$@")"); do + for text_id in $(kv_author_texts "$(kv_author_id "$@")" | uniq); do kv_text "$text_id" done ;; *)