commit 21e3ae3b1c38b18d6dc470781893fb62e4ff4210 Author: ge Date: Wed Apr 26 16:22:48 2023 +0300 init diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..dcb767f --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +bottle = "*" +gunicorn = "*" + +[dev-packages] + +[requires] +python_version = "3.10" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..bfcb254 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,45 @@ +{ + "_meta": { + "hash": { + "sha256": "6cf82d4043d4c4bf722b99766ec56d49b04447a5e89726986ac957c46efc2196" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.10" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "bottle": { + "hashes": [ + "sha256:d6f15f9d422670b7c073d63bd8d287b135388da187a0f3e3c19293626ce034ea", + "sha256:e1a9c94970ae6d710b3fb4526294dfeb86f2cb4a81eff3a4b98dc40fb0e5e021" + ], + "index": "pypi", + "version": "==0.12.25" + }, + "gunicorn": { + "hashes": [ + "sha256:9dcc4547dbb1cb284accfb15ab5667a0e5d1881cc443e0677b4882a4067a807e", + "sha256:e0a968b5ba15f8a328fdfd7ab1fcb5af4470c28aaf7e55df02a99bc13138e6e8" + ], + "index": "pypi", + "version": "==20.1.0" + }, + "setuptools": { + "hashes": [ + "sha256:23aaf86b85ca52ceb801d32703f12d77517b2556af839621c641fca11287952b", + "sha256:f104fa03692a2602fa0fec6c6a9e63b6c8a968de13e17c026957dd1f53d80990" + ], + "markers": "python_version >= '3.7'", + "version": "==67.7.2" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..a9f5dba --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# Cursed API for ArchiveBox + +ArchiveBox [doesn't have a web API](https://github.com/ArchiveBox/ArchiveBox/issues/496) yet. This is shitty single-endpoint API to automate page archiving. It uses subprocess to run archivebox CLI. 
"""Single-endpoint HTTP wrapper around the ArchiveBox CLI.

Exposes ``GET /add``, which starts ``archivebox add`` for the requested URL.
The CLI runs in a separate process so the HTTP worker is not blocked while
the page is being archived.
"""
import json
import logging
import os
import shlex
import subprocess
from multiprocessing import Process

from bottle import run, get, request, response


ARCHIVEBOX_BIN = os.getenv('ARCHIVEBOX_BIN') or '/usr/bin/archivebox'
# Environment values are always strings; normalise so the port has one type
# regardless of whether it came from the environment or the default.
CURSED_PORT = int(os.getenv('CURSED_PORT') or 9998)
CURSED_HOST = os.getenv('CURSED_HOST') or '0.0.0.0'
CURSED_SERVER = os.getenv('CURSED_SERVER') or 'gunicorn'


logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(name)s: %(message)s"
)


def shell_exec(command: list, to_stdin: str | None = None) -> str:
    """Run *command* without a shell and return its stripped stdout.

    Args:
        command: Program and arguments, already split into a list.
        to_stdin: Optional text written to the process's stdin; a trailing
            newline is appended.

    Returns:
        Standard output decoded as UTF-8 with surrounding whitespace removed.

    Raises:
        RuntimeError: The process exited with a non-zero status. The message
            is the decoded stderr of the process.
    """
    payload = None
    if to_stdin:
        payload = ('%s\n' % to_stdin).encode('utf-8')
    with subprocess.Popen(command,
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE) as pipe:
        # communicate() feeds stdin and drains stdout/stderr together, so a
        # large input or output cannot block on a full pipe buffer.
        output, error = pipe.communicate(input=payload)
    if pipe.returncode != 0:
        raise RuntimeError(error.decode('utf-8', errors='replace'))
    return output.strip().decode('utf-8', errors='replace')


def run_bg_task(cmd):
    """Execute *cmd* via ``shell_exec``; used as the target of a child process.

    Failures are logged instead of propagating, because nothing waits on the
    child process to observe an exception.
    """
    logging.debug('PID=%s Run "background" thread...', os.getpid())
    try:
        shell_exec(cmd)
    except (OSError, RuntimeError):
        # OSError: the binary could not be started; RuntimeError: it ran
        # and exited with a non-zero status.
        logging.exception('PID=%s Background command failed', os.getpid())
    else:
        logging.debug('PID=%s Background thread finished', os.getpid())


def _bad_request(message: str) -> str:
    """Set HTTP status 400 on the current response and return a JSON error."""
    response.status = 400
    return json.dumps({'msg': message})


@get('/add')
def add_to_archive() -> str:
    """Start ``archivebox add`` for the URL given in the query string.

    Query parameters:
        url: Resource to archive (required).
        depth: Optional value for ``--depth``; must consist of ASCII digits.
        tag: Optional comma separated tags, passed through as ``--tag``.

    Returns:
        A JSON document ``{"msg": ...}``. Status 400 is set when ``url`` is
        missing or a parameter is malformed. Otherwise ``OK`` is returned as
        soon as the background process has been started; the outcome of the
        archive run itself is not reported to the client.
    """
    response.content_type = 'application/json'
    url = request.query.url or None
    depth = request.query.depth or None
    tag = request.query.tag or None

    if url is None:
        return _bad_request('Error: No URL query parameter provided')
    if url.startswith('-'):
        # A leading dash would be parsed by the CLI as an option.
        return _bad_request('Error: Invalid URL query parameter')
    if depth is not None and not (depth.isascii() and depth.isdigit()):
        return _bad_request('Error: depth must be a non-negative integer')

    # shlex keeps quoted segments (e.g. a compose file path with spaces)
    # together as a single argument.
    cmd = shlex.split(ARCHIVEBOX_BIN)
    cmd.append('add')
    if depth:
        cmd.append('--depth=' + depth)
    if tag:
        cmd.append('--tag=' + tag)
    # The command is executed without a shell, so the URL is passed as-is;
    # wrapping it in quotes would hand the quote characters to the CLI.
    cmd.append(url)
    logging.debug('PID=%s Command to run: %s', os.getpid(), cmd)
    taskrun = Process(target=run_bg_task, args=(cmd,))
    taskrun.start()
    return json.dumps({'msg': 'OK'})


if __name__ == '__main__':
    # Guarded so that multiprocessing's "spawn" start method, which
    # re-imports the main module in every child, does not try to start a
    # second server inside the background process.
    run(server=CURSED_SERVER, host=CURSED_HOST, port=CURSED_PORT)