From eef8ca29f0c359263af9f988001ed127ae25432c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 30 Mar 2019 20:31:28 -0400 Subject: [PATCH 001/333] hide compression detection failure during config setup --- archivebox/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index 0d49a5d2..23a92ebf 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -74,7 +74,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC -WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode) +WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode) URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE) From 924de7f68c315cd55fcfb77257b01f28365f855c Mon Sep 17 00:00:00 2001 From: luoliyan Date: Tue, 2 Apr 2019 13:13:07 +0930 Subject: [PATCH 002/333] Update purge script to match codebase cleanup --- archivebox/purge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/purge.py b/archivebox/purge.py index 26b18817..e2e4e97c 100755 --- a/archivebox/purge.py +++ b/archivebox/purge.py @@ -6,9 +6,9 @@ from os.path import exists, join from shutil import rmtree from typing import List -from archive import parse_json_link_index from config import ARCHIVE_DIR, OUTPUT_DIR -from index import write_html_links_index, write_json_links_index +from index import (parse_json_links_index, write_html_links_index, + write_json_links_index) def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None: @@ -16,7 +16,7 @@ def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None: exit('index.json is missing; nothing to do') compiled = [re.compile(r) for r in regexes] - links = parse_json_link_index(OUTPUT_DIR)['links'] + links = parse_json_links_index(OUTPUT_DIR) filtered = [] remaining = [] From 0d2bf610b2ed82c87c78c3655a1f6512551f2ddb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 3 Apr 2019 03:27:15 -0400 Subject: [PATCH 003/333] typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 15358d5d..5c698868 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ I don't think everything should be preserved in an automated fashion, making all #### User Interface & Intended Purpose -ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI inferface for users to ingest built feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. +ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI inferface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. 
An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now it only ingests lists of links at a time via browser history, bookmarks, RSS, etc. From 585a28e7c919c980a3d0e53cdac835ed3748b630 Mon Sep 17 00:00:00 2001 From: Anton Rieder <1301152+aried3r@users.noreply.github.com> Date: Wed, 3 Apr 2019 12:49:32 +0200 Subject: [PATCH 004/333] Small typo fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c698868..0fc21154 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ echo 'https://example.com' | ./archive # pass URLs to archive v ./archive https://getpocket.com/users/example/feed/all # or import an RSS/JSON/XML/TXT feed ``` -One you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archive.sweeting.me](https://archive.sweeting.me) +Once you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archive.sweeting.me](https://archive.sweeting.me) For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs. *(`pip install archivebox` will be available in the near future, follow our [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for progress)* From 403025a73b1d96ebcd2dba8c681c63529a5a4980 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 10 Apr 2019 17:09:54 -0400 Subject: [PATCH 005/333] Update bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b350fb28..66a2d21b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,24 +7,24 @@ assignees: '' --- -(please fill out the following information, feel free to delete sections if they're not applicable) +(please fill out the following information, feel free to delete sections if they're not applicable or if long issue templates annoy you) -## Describe the bug +#### Describe the bug A description of what the bug is, what you expected to happen, and any relevant context about issue. -## Steps to reproduce +#### Steps to reproduce 1. Ran ArchiveBox with the following config '...' 2. Saw this output during archiving '....' 3. UI didn't show the thing I was expecting '....' -## Screenshots or log output +#### Screenshots or log output If applicable, post any relevant screenshots or copy/pasted terminal output from ArchiveBox. If you're reporting a parsing / importing error, **you must paste a copy of your redacted import file here**. -## Software versions +#### Software versions - OS: ([e.g. macOS 10.14] the operating system you're running ArchiveBox on) - ArchiveBox version: (`git rev-parse HEAD | head -c7` [e.g. 
d798117] commit ID of the version you're running) From 4f599c0b0b07c842b1a2d0ec31f229d8fa0d6294 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 10 Apr 2019 22:46:20 -0400 Subject: [PATCH 006/333] escape all non-windows-friendly filenames --- archivebox/archive_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 56009cd1..b2f04f33 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -226,7 +226,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT): '--span-hosts', '--no-parent', '-e', 'robots=off', - '--restrict-file-names=unix', + '--restrict-file-names=windows', '--timeout={}'.format(timeout), *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), *(() if FETCH_WARC else ('--timestamping',)), From e9f9c1ec5da2433ef95b23d7526d69458e01ad3c Mon Sep 17 00:00:00 2001 From: Bruno Tavares Date: Thu, 11 Apr 2019 22:43:52 -0300 Subject: [PATCH 007/333] Copy project into image instead of cloning Docker `RUN` statements cache based on the text of the command executed, not the content of what it does to the image. Since the command was cloning the project, and the text didn't change, building the image would not update the code if the image was already cached. This lead to a stale Docker image distributed on Docker Hub. This could also cause some confusion, as modified code would not show up on the image during the build process. This commit changes the build process to copy the content of the project into the image. Whenever a file changes it will trigger a new updated image. --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index d5683cad..c53e5c7a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,8 +45,8 @@ RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \ && chown -R pptruser:pptruser /node_modules # Install the ArchiveBox repository and pip requirements -RUN git clone https://github.com/pirate/ArchiveBox /home/pptruser/app \ - && mkdir -p /data \ +COPY . /home/pptruser/app +RUN mkdir -p /data \ && chown -R pptruser:pptruser /data \ && ln -s /data /home/pptruser/app/archivebox/output \ && ln -s /home/pptruser/app/bin/* /bin/ \ From 6401158f7f30f04a15bd070d9a94416a1c621e77 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 12 Apr 2019 13:59:22 -0400 Subject: [PATCH 008/333] comment out IRC links until we find a better chat solution --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0fc21154..435e8b82 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ "Your own personal internet archive" (网站存档 / 爬虫) - + @@ -178,7 +178,7 @@ Because ArchiveBox is designed to ingest a firehose of browser history and bookm ## Learn more -▶ **Join out our [community chat](http://webchat.freenode.net?channels=ArchiveBox&uio=d4) hosted on IRC freenode.net:`#ArchiveBox`!** + Whether you want learn which organizations are the big players in the web archiving space, want to find a specific open source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! @@ -268,7 +268,7 @@ Contributor Spotlight:


- +

From adfcb1517a086d77441ff6b4d9d766a5c8d94d84 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Apr 2019 21:03:51 -0400 Subject: [PATCH 009/333] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 435e8b82..6b36c859 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,9 @@
+ +*Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info...* + **ArchiveBox takes a list of website URLs you want to archive, and creates a local, static, browsable HTML clone of the content from those websites (it saves HTML, JS, media files, PDFs, images and more).** From 24e8eb95ddf1af7040e539503a56c1dc55774bcc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Apr 2019 21:04:18 -0400 Subject: [PATCH 010/333] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b36c859..63fa7f32 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@
-*Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info...* +*💥 Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info... 💥* From 59da48206ad7f64ea9b5a7e869d47a87e5534c3a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 16 Apr 2019 21:21:40 -0400 Subject: [PATCH 011/333] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63fa7f32..1622c393 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@
-*💥 Attention: Big API changes are coming soon! See [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) for more info... 💥* +*💥 Attention: Big API changes are coming soon! Check out [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) and help us test it! 💥* From 332a32f4f9b6f548d9a61495ec9008667ca1f5f6 Mon Sep 17 00:00:00 2001 From: Drewry Pope Date: Sat, 20 Apr 2019 02:59:44 -0500 Subject: [PATCH 012/333] Resolve 3 typos in util.py --- archivebox/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index cec23035..3c08c9bb 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -66,7 +66,7 @@ HTML_TITLE_REGEX = re.compile( re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, ) STATICFILE_EXTENSIONS = { - # 99.999% of the time, URLs ending in these extentions are static files + # 99.999% of the time, URLs ending in these extensions are static files # that can be downloaded as-is, not html pages that need to be rendered 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', @@ -82,7 +82,7 @@ STATICFILE_EXTENSIONS = { # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - # Thse are always treated as pages, not as static files, never add them: + # These are always treated as pages, not as static files, never add them: # html, htm, shtml, xhtml, xml, aspx, php, cgi } @@ -293,7 +293,7 @@ def str_between(string, start, end=None): ### Link Helpers def merge_links(a, b): - """deterministially merge two links, favoring longer field values over shorter, + """deterministically merge two links, favoring longer field values over shorter, and "cleaner" values over worse ones. """ longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key]) From 34270b2b1239b948d9598b7bb6ea8b31131066b8 Mon Sep 17 00:00:00 2001 From: Pig Monkey Date: Tue, 30 Apr 2019 17:25:41 -0700 Subject: [PATCH 013/333] only use stdin if it has a value Closes #228 --- archivebox/archive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/archive.py b/archivebox/archive.py index 5c0d195d..3e553e6e 100755 --- a/archivebox/archive.py +++ b/archivebox/archive.py @@ -86,8 +86,8 @@ def main(*args): ) print_help() raise SystemExit(1) - - import_path = save_stdin_source(stdin_raw_text) + if stdin_raw_text: + import_path = save_stdin_source(stdin_raw_text) ### Handle ingesting urls from a remote file/feed # (e.g. 
if an RSS feed URL is used as the import path) From 500534f4be87e94f05d9cf6063babd4faa5145cc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 15:17:16 -0400 Subject: [PATCH 014/333] fix missing comma in staticfile extensions list --- archivebox/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index 3c08c9bb..6f63b53f 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -70,7 +70,7 @@ STATICFILE_EXTENSIONS = { # that can be downloaded as-is, not html pages that need to be rendered 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8' + 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'atom', 'rss', 'css', 'js', 'json', 'dmg', 'iso', 'img', From 050cd9c8616cae31e388ecb4a312e107decc1f57 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 15:28:55 -0400 Subject: [PATCH 015/333] add license to manifest --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index ddb780e6..9100b772 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ +include LICENSE include archivebox/VERSION graft archivebox/themes graft archivebox/themes/static From 2440c1c1bf5901dc058baee0a9aeac78f2babcc8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 15:30:42 -0400 Subject: [PATCH 016/333] just use simple version instead of git hash --- setup.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/setup.py b/setup.py index adca4887..310c9691 100644 --- a/setup.py +++ b/setup.py @@ -8,19 +8,10 @@ with open("README.md", "r") as fh: script_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__))) VERSION = open(os.path.join(script_dir, 'archivebox', 'VERSION'), 'r').read().strip() -try: - GIT_HEAD = open(os.path.join(script_dir, '.git', 'HEAD'), 'r').read().strip().split(': ')[1] - GIT_SHA = open(os.path.join(script_dir, '.git', GIT_HEAD), 'r').read().strip()[:9] - PYPI_VERSION = "{}+{}".format(VERSION, GIT_SHA) -except: - PYPI_VERSION = VERSION - -with open(os.path.join(script_dir, 'archivebox', 'VERSION'), 'w+') as f: - f.write(PYPI_VERSION) setuptools.setup( name="archivebox", - version=PYPI_VERSION, + version=VERSION, author="Nick Sweeting", author_email="git@nicksweeting.com", description="The self-hosted internet archive.", From 7ee837c12784e196865a06ce09b84a91f1c1274d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 15:32:18 -0400 Subject: [PATCH 017/333] add twine to packages --- Pipfile | 1 + Pipfile.lock | 117 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 84 insertions(+), 34 deletions(-) diff --git a/Pipfile b/Pipfile index 194f81db..7c7e05ce 100644 --- a/Pipfile +++ b/Pipfile @@ -12,6 +12,7 @@ setuptools = "*" sphinx = "*" recommonmark = "*" sphinx-rtd-theme = "*" +twine = "*" [packages] dataclasses = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 663654b1..64a9bae2 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "8ac4f9e5cd266406a861a283b321b9eee0ca469638f838e93467403ef2f0594d" + "sha256": "5a1618caef76ff53b66c5e8674d8e639d25f75068f7026ad799e217d307628fc" }, "pipfile-spec": 6, "requires": { @@ -64,11 +64,11 @@ }, "django": { 
"hashes": [ - "sha256:7c3543e4fb070d14e10926189a7fcf42ba919263b7473dceaefce34d54e8a119", - "sha256:a2814bffd1f007805b19194eb0b9a331933b82bd5da1c3ba3d7b7ba16e06dc4b" + "sha256:6fcc3cbd55b16f9a01f37de8bcbe286e0ea22e87096557f1511051780338eaea", + "sha256:bb407d0bb46395ca1241f829f5bd03f7e482f97f7d1936e26e98dacb201ed4ec" ], "index": "pypi", - "version": "==2.2" + "version": "==2.2.1" }, "django-extensions": { "hashes": [ @@ -203,11 +203,11 @@ }, "youtube-dl": { "hashes": [ - "sha256:46f6e30c673ba71de84748dad4c264d1b6fb30beebf1ef834846a651b4524a78", - "sha256:b20d110e1bed8d16f5771bb938ab6e5da67f08af62b599af65301cca290f2e15" + "sha256:31844229a4f4d7003e03ab309ff2caff1b16ce0acbd3cfb7a13276058af13056", + "sha256:a751bd293e2d7ee963910de14b3eb95b88837021899be488fade0b8abe815650" ], "index": "pypi", - "version": "==2019.4.24" + "version": "==2019.4.30" } }, "develop": { @@ -240,6 +240,13 @@ ], "version": "==0.1.0" }, + "bleach": { + "hashes": [ + "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", + "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" + ], + "version": "==3.1.0" + }, "certifi": { "hashes": [ "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", @@ -256,10 +263,10 @@ }, "commonmark": { "hashes": [ - "sha256:9f6dda7876b2bb88dd784440166f4bc8e56cb2b2551264051123bacb0b6c1d8a", - "sha256:abcbc854e0eae5deaf52ae5e328501b78b4a0758bf98ac8bb792fce993006084" + "sha256:14c3df31e8c9c463377e287b2a1eefaa6019ab97b22dad36e2f32be59d61d68d", + "sha256:867fc5db078ede373ab811e16b6789e9d033b15ccd7296f370ca52d1ee792ce0" ], - "version": "==0.8.1" + "version": "==0.9.0" }, "decorator": { "hashes": [ @@ -449,6 +456,13 @@ ], "version": "==0.7.5" }, + "pkginfo": { + "hashes": [ + "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb", + "sha256:a6d9e40ca61ad3ebd0b72fbadd4fba16e4c0e4df0428c041e01e06eb6ee71f32" + ], + "version": "==1.5.0.1" + }, "prompt-toolkit": { "hashes": [ "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", @@ -499,6 +513,13 @@ ], "version": "==2019.1" }, + "readme-renderer": { + "hashes": [ + "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", + "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d" + ], + "version": "==24.0" + }, "recommonmark": { "hashes": [ "sha256:a520b8d25071a51ae23a27cf6252f2fe387f51bdc913390d83b2b50617f5bb48", @@ -514,6 +535,13 @@ ], "version": "==2.21.0" }, + "requests-toolbelt": { + "hashes": [ + "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", + "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" + ], + "version": "==0.9.1" + }, "six": { "hashes": [ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", @@ -586,6 +614,13 @@ ], "version": "==1.1.3" }, + "tqdm": { + "hashes": [ + "sha256:d385c95361699e5cf7622485d9b9eae2d4864b21cd5a2374a9c381ffed701021", + "sha256:e22977e3ebe961f72362f6ddfb9197cc531c9737aaf5f607ef09740c849ecd05" + ], + "version": "==4.31.1" + }, "traitlets": { "hashes": [ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", @@ -593,30 +628,37 @@ ], "version": "==4.3.2" }, + "twine": { + "hashes": [ + "sha256:0fb0bfa3df4f62076cab5def36b1a71a2e4acb4d1fa5c97475b048117b1a6446", + "sha256:d6c29c933ecfc74e9b1d9fa13aa1f87c5d5770e119f5a4ce032092f0ff5b14dc" + ], + "index": "pypi", + "version": "==1.13.0" + }, "typed-ast": { "hashes": [ - "sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200", - 
"sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0", - "sha256:252fdae740964b2d3cdfb3f84dcb4d6247a48a6abe2579e8029ab3be3cdc026c", - "sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99", - "sha256:2c88d0a913229a06282b285f42a31e063c3bf9071ff65c5ea4c12acb6977c6a7", - "sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1", - "sha256:3d2e3ab175fc097d2a51c7a0d3fda442f35ebcc93bb1d7bd9b95ad893e44c04d", - "sha256:4766dd695548a15ee766927bf883fb90c6ac8321be5a60c141f18628fb7f8da8", - "sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de", - "sha256:5cddb6f8bce14325b2863f9d5ac5c51e07b71b462361fd815d1d7706d3a9d682", - "sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db", - "sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8", - "sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7", - "sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f", - "sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15", - "sha256:bb27d4e7805a7de0e35bd0cb1411bc85f807968b2b0539597a49a23b00a622ae", - "sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3", - "sha256:f0937165d1e25477b01081c4763d2d9cdc3b18af69cb259dd4f640c9b900fe5e", - "sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a", - "sha256:fc71d2d6ae56a091a8d94f33ec9d0f2001d1cb1db423d8b4355debfe9ce689b7" + "sha256:132eae51d6ef3ff4a8c47c393a4ef5ebf0d1aecc96880eb5d6c8ceab7017cc9b", + "sha256:18141c1484ab8784006c839be8b985cfc82a2e9725837b0ecfa0203f71c4e39d", + "sha256:2baf617f5bbbfe73fd8846463f5aeafc912b5ee247f410700245d68525ec584a", + "sha256:3d90063f2cbbe39177e9b4d888e45777012652d6110156845b828908c51ae462", + "sha256:4304b2218b842d610aa1a1d87e1dc9559597969acc62ce717ee4dfeaa44d7eee", + "sha256:4983ede548ffc3541bae49a82675996497348e55bafd1554dc4e4a5d6eda541a", + "sha256:5315f4509c1476718a4825f45a203b82d7fdf2a6f5f0c8f166435975b1c9f7d4", + "sha256:6cdfb1b49d5345f7c2b90d638822d16ba62dc82f7616e9b4caa10b72f3f16649", + "sha256:7b325f12635598c604690efd7a0197d0b94b7d7778498e76e0710cd582fd1c7a", + "sha256:8d3b0e3b8626615826f9a626548057c5275a9733512b137984a68ba1598d3d2f", + "sha256:8f8631160c79f53081bd23446525db0bc4c5616f78d04021e6e434b286493fd7", + "sha256:912de10965f3dc89da23936f1cc4ed60764f712e5fa603a09dd904f88c996760", + "sha256:b010c07b975fe853c65d7bbe9d4ac62f1c69086750a574f6292597763781ba18", + "sha256:c908c10505904c48081a5415a1e295d8403e353e0c14c42b6d67f8f97fae6616", + "sha256:c94dd3807c0c0610f7c76f078119f4ea48235a953512752b9175f9f98f5ae2bd", + "sha256:ce65dee7594a84c466e79d7fb7d3303e7295d16a83c22c7c4037071b059e2c21", + "sha256:eaa9cfcb221a8a4c2889be6f93da141ac777eb8819f077e1d09fb12d00a09a93", + "sha256:f3376bc31bad66d46d44b4e6522c5c21976bf9bca4ef5987bb2bf727f4506cbb", + "sha256:f9202fa138544e13a4ec1a6792c35834250a85958fde1251b6a22e07d1260ae7" ], - "version": "==1.3.4" + "version": "==1.3.5" }, "typing-extensions": { "hashes": [ @@ -628,10 +670,10 @@ }, "urllib3": { "hashes": [ - "sha256:4c291ca23bbb55c76518905869ef34bdd5f0e46af7afe6861e8375643ffee1a0", - "sha256:9a247273df709c4fedb38c711e44292304f73f39ab01beda9f6b9fc375669ac3" + "sha256:2393a695cd12afedd0dcb26fe5d50d0cf248e5a66f75dbd89a3d4eb333a61af4", + "sha256:a637e5fae88995b256e3409dc4d52c2e2e0ba32c42a6365fee8bbd2238de3cfb" ], - "version": "==1.24.2" + "version": "==1.24.3" }, "wcwidth": { "hashes": [ @@ -639,6 +681,13 @@ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" ], "version": 
"==0.1.7" + }, + "webencodings": { + "hashes": [ + "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", + "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" + ], + "version": "==0.5.1" } } } From ef77a6d43f69a60d4d29cc74e61833a3fd7b39c4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 15:39:55 -0400 Subject: [PATCH 018/333] add some missing fields to setup.py --- setup.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 310c9691..32809c76 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,15 @@ import os import setuptools -with open("README.md", "r") as fh: - long_description = fh.read() +BASE_DIR = os.path.abspath(os.path.dirname(os.path.abspath(__file__))) +PYTHON_DIR = os.path.join(BASE_DIR, 'archivebox') +with open('README.md', "r") as f: + README = f.read() -script_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__))) +with open(os.path.join(PYTHON_DIR, 'VERSION'), 'r') as f: + VERSION = f.read().strip() -VERSION = open(os.path.join(script_dir, 'archivebox', 'VERSION'), 'r').read().strip() setuptools.setup( name="archivebox", @@ -15,9 +17,10 @@ setuptools.setup( author="Nick Sweeting", author_email="git@nicksweeting.com", description="The self-hosted internet archive.", - long_description=long_description, + long_description=README, long_description_content_type="text/markdown", url="https://github.com/pirate/ArchiveBox", + license='MIT', project_urls={ 'Documentation': 'https://github.com/pirate/ArchiveBox/Wiki', 'Community': 'https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community', @@ -27,7 +30,7 @@ setuptools.setup( 'Changelog': 'https://github.com/pirate/ArchiveBox/wiki/Changelog', 'Patreon': 'https://github.com/pirate/ArchiveBox/wiki/Donations', }, - packages=setuptools.find_packages(), + packages=['archivebox',], python_requires='>=3.6', install_requires=[ "dataclasses==0.6", From ba21ff46f3b65809f47a5b37920cc8dbe402355d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 16:10:34 -0400 Subject: [PATCH 019/333] reverse the url order --- setup.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 32809c76..723aeaa1 100644 --- a/setup.py +++ b/setup.py @@ -22,15 +22,15 @@ setuptools.setup( url="https://github.com/pirate/ArchiveBox", license='MIT', project_urls={ - 'Documentation': 'https://github.com/pirate/ArchiveBox/Wiki', - 'Community': 'https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community', - 'Source': 'https://github.com/pirate/ArchiveBox', - 'Bug Tracker': 'https://github.com/pirate/ArchiveBox/issues', - 'Roadmap': 'https://github.com/pirate/ArchiveBox/wiki/Roadmap', + 'Donate': 'https://github.com/pirate/ArchiveBox/wiki/Donations', 'Changelog': 'https://github.com/pirate/ArchiveBox/wiki/Changelog', - 'Patreon': 'https://github.com/pirate/ArchiveBox/wiki/Donations', + 'Roadmap': 'https://github.com/pirate/ArchiveBox/wiki/Roadmap', + 'Bug Tracker': 'https://github.com/pirate/ArchiveBox/issues', + 'Source': 'https://github.com/pirate/ArchiveBox', + 'Community': 'https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community', + 'Documentation': 'https://github.com/pirate/ArchiveBox/Wiki', }, - packages=['archivebox',], + packages=setuptools.find_packages(), python_requires='>=3.6', install_requires=[ "dataclasses==0.6", From e0489d77e71a60f2a66d2ede9f774d8fa0eea632 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 17:39:45 -0400 
Subject: [PATCH 020/333] bump the version --- archivebox/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/VERSION b/archivebox/VERSION index 1d0ba9ea..267577d4 100644 --- a/archivebox/VERSION +++ b/archivebox/VERSION @@ -1 +1 @@ -0.4.0 +0.4.1 From d398bd59b017a28fd3911e32550c5d20ec1f09c2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 17:51:21 -0400 Subject: [PATCH 021/333] switch to pure Manifest-based package includes --- MANIFEST.in | 9 ++------- setup.py | 14 +------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 9100b772..a73ef711 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,4 @@ include LICENSE +include README.md include archivebox/VERSION -graft archivebox/themes -graft archivebox/themes/static -graft archivebox/themes/admin -graft archivebox/themes/default -graft archivebox/themes/default/static -graft archivebox/themes/legacy -graft archivebox/themes/legacy/static +recursive-include archivebox/themes * diff --git a/setup.py b/setup.py index 723aeaa1..f23ae7b5 100644 --- a/setup.py +++ b/setup.py @@ -55,19 +55,7 @@ setuptools.setup( 'archivebox = archivebox.__main__:main', ], }, - package_data={ - 'archivebox': [ - # Manifest.ini must correspond 1:1 with this list - 'VERSION', - 'themes/*', - 'themes/static/*', - 'themes/admin/*' - 'themes/default/*' - 'themes/default/static/*' - 'themes/legacy/*', - 'themes/legacy/static/*', - ], - }, + include_package_data=True, classifiers=[ "Development Status :: 4 - Beta", From d016f1efb50a6ba13b84a05754fd92e631e90346 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 17:52:43 -0400 Subject: [PATCH 022/333] bump version --- archivebox/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/VERSION b/archivebox/VERSION index 267577d4..2b7c5ae0 100644 --- a/archivebox/VERSION +++ b/archivebox/VERSION @@ -1 +1 @@ -0.4.1 +0.4.2 From 3c3b2ee62167c499f7f2a047b9d635a28a58544a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 May 2019 19:15:16 -0400 Subject: [PATCH 023/333] expose more django server config options --- archivebox/config/__init__.py | 29 ++++++++++++--- archivebox/config/stubs.py | 9 ++++- archivebox/core/settings.py | 43 ++++++++++------------- archivebox/core/urls.py | 6 ++++ archivebox/core/views.py | 18 +++++++++- archivebox/main.py | 6 ++-- archivebox/themes/default/main_index.html | 2 +- etc/ArchiveBox.conf.default | 13 +++++-- 8 files changed, 89 insertions(+), 37 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 72baec64..04b8515c 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -44,10 +44,19 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'TIMEOUT': {'type': int, 'default': 60}, 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'}, - 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, 'URL_BLACKLIST': {'type': str, 'default': None}, }, + 'SERVER_CONFIG': { + 'SECRET_KEY': {'type': str, 'default': None}, + 'ALLOWED_HOSTS': {'type': str, 'default': '*'}, + 'DEBUG': {'type': bool, 'default': False}, + 'PUBLIC_INDEX': {'type': bool, 'default': True}, + 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, + 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. 
Contact server owner for any takedown requests.'}, + 'ACTIVE_THEME': {'type': str, 'default': 'default'}, + }, + 'ARCHIVE_METHOD_TOGGLES': { 'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)}, 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, @@ -313,9 +322,6 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: with open(config_path, 'w+') as f: f.write(CONFIG_HEADER) - if not config: - return {} - config_file = ConfigParser() config_file.optionxform = str config_file.read(config_path) @@ -336,6 +342,21 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: config_file[section] = {**existing_config, key: val} + # always make sure there's a SECRET_KEY defined for Django + existing_secret_key = None + if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']: + existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY'] + + if (not existing_secret_key) or ('not a valid secret' in existing_secret_key): + from django.utils.crypto import get_random_string + chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.' + random_secret_key = get_random_string(50, chars) + if 'SERVER_CONFIG' in config_file: + config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key + else: + config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key} + + f.write(CONFIG_HEADER) config_file.write(f) try: diff --git a/archivebox/config/stubs.py b/archivebox/config/stubs.py index f7d5059a..7d3925dd 100644 --- a/archivebox/config/stubs.py +++ b/archivebox/config/stubs.py @@ -22,9 +22,16 @@ class ConfigDict(BaseConfig, total=False): TIMEOUT: int MEDIA_TIMEOUT: int OUTPUT_PERMISSIONS: str - FOOTER_INFO: str URL_BLACKLIST: Optional[str] + SECRET_KEY: str + ALLOWED_HOSTS: str + DEBUG: bool + PUBLIC_INDEX: bool + PUBLIC_SNAPSHOTS: bool + FOOTER_INFO: str + ACTIVE_THEME: str + SAVE_TITLE: bool SAVE_FAVICON: bool SAVE_WGET: bool diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index e128f8d0..463a7079 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -3,26 +3,25 @@ __package__ = 'archivebox.core' import os import sys -SECRET_KEY = '---------------- not a valid secret key ! 
----------------' -DEBUG = os.getenv('DEBUG', 'False').lower() == 'true' -ALLOWED_HOSTS = ['*'] -REPO_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), os.path.pardir, os.path.pardir)) -OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir)) -ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive') -DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3') +from ..config import ( + OUTPUT_DIR, + SECRET_KEY, + DEBUG, + ALLOWED_HOSTS, + PYTHON_DIR, + ACTIVE_THEME, + SQL_INDEX_FILENAME, +) -ACTIVE_THEME = 'default' +ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] -APPEND_SLASH = True - INSTALLED_APPS = [ 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', - # 'django.contrib.sites', 'django.contrib.messages', 'django.contrib.admin', 'django.contrib.staticfiles', @@ -40,17 +39,17 @@ MIDDLEWARE = [ 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', - # 'django.middleware.clickjacking.XFrameOptionsMiddleware', ] ROOT_URLCONF = 'core.urls' +APPEND_SLASH = True TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', 'DIRS': [ - os.path.join(REPO_DIR, 'themes', ACTIVE_THEME), - os.path.join(REPO_DIR, 'themes', 'default'), - os.path.join(REPO_DIR, 'themes'), + os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME), + os.path.join(PYTHON_DIR, 'themes', 'default'), + os.path.join(PYTHON_DIR, 'themes'), ], 'APP_DIRS': True, 'OPTIONS': { @@ -69,7 +68,7 @@ WSGI_APPLICATION = 'core.wsgi.application' DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': DATABASE_FILE, + 'NAME': os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME), } } @@ -104,7 +103,7 @@ SHELL_PLUS_PRINT_SQL = False IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' if IS_SHELL: - os.environ['PYTHONSTARTUP'] = os.path.join(REPO_DIR, 'core', 'welcome_message.py') + os.environ['PYTHONSTARTUP'] = os.path.join(PYTHON_DIR, 'core', 'welcome_message.py') LANGUAGE_CODE = 'en-us' @@ -118,11 +117,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' STATIC_URL = '/static/' STATICFILES_DIRS = [ - os.path.join(REPO_DIR, 'themes', ACTIVE_THEME, 'static'), - os.path.join(REPO_DIR, 'themes', 'default', 'static'), - os.path.join(REPO_DIR, 'themes', 'static'), + os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME, 'static'), + os.path.join(PYTHON_DIR, 'themes', 'default', 'static'), + os.path.join(PYTHON_DIR, 'themes', 'static'), ] - -SERVE_STATIC = True - - diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 2a001f6b..9b4af5a5 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -22,8 +22,14 @@ urlpatterns = [ path('add/', AddLinks.as_view(), name='AddLinks'), path('static/', views.serve), + + path('accounts/login/', RedirectView.as_view(url='/admin/login/')), + path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')), + path('accounts/', include('django.contrib.auth.urls')), path('admin/', admin.site.urls), + + path('', MainIndex.as_view(), name='Home'), ] diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 2c140d58..7fee7408 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -4,11 +4,18 @@ from django.shortcuts import render, redirect from django.http import HttpResponse from django.views import View, static +from django.conf import settings from core.models 
import Snapshot from ..index import load_main_index, load_main_index_meta -from ..config import OUTPUT_DIR, VERSION, FOOTER_INFO +from ..config import ( + OUTPUT_DIR, + VERSION, + FOOTER_INFO, + PUBLIC_INDEX, + PUBLIC_SNAPSHOTS, +) from ..util import base_url @@ -16,6 +23,9 @@ class MainIndex(View): template = 'main_index.html' def get(self, request): + if not request.user.is_authenticated and not PUBLIC_INDEX: + return redirect(f'/admin/login/?next={request.path}') + all_links = load_main_index(out_dir=OUTPUT_DIR) meta_info = load_main_index_meta(out_dir=OUTPUT_DIR) @@ -34,6 +44,9 @@ class AddLinks(View): template = 'add_links.html' def get(self, request): + if not request.user.is_authenticated and not PUBLIC_INDEX: + return redirect(f'/admin/login/?next={request.path}') + context = {} return render(template_name=self.template, request=request, context=context) @@ -54,6 +67,9 @@ class LinkDetails(View): if '/' not in path: return redirect(f'{path}/index.html') + if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: + return redirect(f'/admin/login/?next={request.path}') + try: slug, archivefile = path.split('/', 1) except (IndexError, ValueError): diff --git a/archivebox/main.py b/archivebox/main.py index 00529743..80e4b77b 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -292,14 +292,14 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None: setup_django(out_dir, check_db=False) from django.conf import settings - assert settings.DATABASE_FILE == os.path.join(out_dir, SQL_INDEX_FILENAME) - print(f' √ {settings.DATABASE_FILE}') + DATABASE_FILE = os.path.join(out_dir, SQL_INDEX_FILENAME) + print(f' √ {DATABASE_FILE}') print() for migration_line in apply_migrations(out_dir): print(f' {migration_line}') - assert os.path.exists(settings.DATABASE_FILE) + assert os.path.exists(DATABASE_FILE) # from django.contrib.auth.models import User # if IS_TTY and not User.objects.filter(is_superuser=True).exists(): diff --git a/archivebox/themes/default/main_index.html b/archivebox/themes/default/main_index.html index 4ad00be7..925c4fa4 100644 --- a/archivebox/themes/default/main_index.html +++ b/archivebox/themes/default/main_index.html @@ -190,7 +190,7 @@
Add Links   |   - Admin   |   + Admin   |   Docs
diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 31017ce4..df9abe22 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -1,6 +1,6 @@ # This is the example default configiration file for ArchiveBox. # -# Copy example config from here into your project's ArchiveBox.conf file, +# Copy lines from here into your project's ArchiveBox.conf file and uncomment, # DO NOT EDIT THIS FILE DIRECTLY! # # See the list of all the possible options. documentation, and examples here: @@ -11,10 +11,17 @@ # ONLY_NEW = False # TIMEOUT = 60 # MEDIA_TIMEOUT = 3600 -# ACTIVE_THEME = default -# FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests. # URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$) +[SERVER_CONFIG] +# SECRET_KEY = ---------------- not a valid secret key ! ---------------- +# DEBUG = False +# PUBLIC_INDEX = True +# PUBLIC_SNAPSHOTS = True +# FOOTER_INFO = Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests. +# ACTIVE_THEME = default + + [ARCHIVE_METHOD_TOGGLES] # SAVE_TITLE = True # SAVE_FAVICON = True From ca9c9ef956e9ecc7e4f3b07ed6ea74cd434ebb68 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 6 May 2019 17:16:20 -0400 Subject: [PATCH 024/333] add warning about running manage.py directly --- archivebox/manage.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/archivebox/manage.py b/archivebox/manage.py index 52c21895..3976c2c2 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -3,6 +3,20 @@ import os import sys if __name__ == '__main__': + # if you're a developer working on archivebox, still prefer the archivebox + # versions of ./manage.py commands whenever possible. When that's not possible + # (e.g. makemigrations), you can comment out this check temporarily + + print("[X] Don't run ./manage.py directly, use the archivebox CLI instead e.g.:") + print(' archivebox manage createsuperuser') + print() + print(' Hint: Use these archivebox commands instead of the ./manage.py equivalents:') + print(' archivebox init (migrates the databse to latest version)') + print(' archivebox server (runs the Django web server)') + print(' archivebox shell (opens an iPython Django shell with all models imported)') + print(' archivebox manage [cmd] (any other management commands)') + raise SystemExit(2) + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') try: from django.core.management import execute_from_command_line From 3b0236b087defc2e73e8f0301c016ce6efbd0b01 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 14 May 2019 23:54:17 +0100 Subject: [PATCH 025/333] Add prefers-color-scheme: dark support --- archivebox/templates/index.html | 46 +++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/archivebox/templates/index.html b/archivebox/templates/index.html index 264deb4d..dd2e16cd 100644 --- a/archivebox/templates/index.html +++ b/archivebox/templates/index.html @@ -3,6 +3,34 @@ Archived Sites - - - - - - - -
{% csrf_token %} - Add new links...
- - + tr td a.favicon img { + padding-left: 6px; + padding-right: 12px; + vertical-align: -4px; + } + tr td a.title { + font-size: 1.4em; + text-decoration: none; + color: black; + } + tr td a.title small { + background-color: #efefef; + border-radius: 4px; + float: right; + } + input[type="search"]::-webkit-search-cancel-button { + -webkit-appearance: searchfield-cancel-button; + } + .title-col { + text-align: left; + } + .title-col a { + color: black; + } + + + + + + + + +
+ {{ stdout | safe }} +

+
+ {% csrf_token %} Add new links...
+
+ +
+ + Go back to Snapshot list +
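The `{{ stdout | safe }}` block in the template above renders terminal output captured while the `archivebox add` code path runs; the `ansi_to_html()` helper added in the util.py diff just below rewrites the ANSI color escapes in that output into HTML styling first. A minimal sketch of the capture half, using the same `StringIO` plus `redirect_stdout` approach that the `AddLinks` view wraps around `add()` (the `run_and_capture` wrapper is an illustrative name, not part of the patch):

    from io import StringIO
    from contextlib import redirect_stdout

    def run_and_capture(func, *args, **kwargs) -> str:
        """Call a print-heavy CLI function and return everything it wrote to stdout."""
        buffer = StringIO()
        with redirect_stdout(buffer):
            func(*args, **kwargs)
        return buffer.getvalue()

    # e.g. output = run_and_capture(add, import_str=url, update_all=False, out_dir=OUTPUT_DIR)
    # the view then passes ansi_to_html(output) to the template as the `stdout` variable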
+ diff --git a/archivebox/util.py b/archivebox/util.py index 87c98263..50511313 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -20,6 +20,7 @@ from .config import ( CHECK_SSL_VALIDITY, WGET_USER_AGENT, CHROME_OPTIONS, + COLOR_DICT ) try: @@ -69,6 +70,8 @@ URL_REGEX = re.compile( re.IGNORECASE, ) +COLOR_REGEX = re.compile(r'\[(?P\d+)(;(?P\d+)(;(?P\d+))?)?m') + def enforce_types(func): """ @@ -195,6 +198,27 @@ def chrome_args(**options) -> List[str]: return cmd_args +def ansi_to_html(text): + """ + Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html + """ + TEMPLATE = '
' + text = text.replace('[m', '
') + + def single_sub(match): + argsdict = match.groupdict() + if argsdict['arg_3'] is None: + if argsdict['arg_2'] is None: + bold, color = 0, argsdict['arg_1'] + else: + bold, color = argsdict['arg_1'], argsdict['arg_2'] + else: + bold, color = argsdict['arg_3'], argsdict['arg_2'] + + return TEMPLATE.format(COLOR_DICT[color][0]) + + return COLOR_REGEX.sub(single_sub, text) + class ExtendedEncoder(pyjson.JSONEncoder): """ From 364c5752d827c87a927bed00e89e4e3d7c5b6e4a Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 1 Jul 2020 12:29:56 -0500 Subject: [PATCH 117/333] feat: Handle empty URL case --- archivebox/core/views.py | 27 +- archivebox/themes/default/add_links.html | 426 +++++++++++------------ 2 files changed, 218 insertions(+), 235 deletions(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index b7911674..5efa79cd 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -57,19 +57,22 @@ class AddLinks(View): def post(self, request): url = request.POST['url'] - print(f'[+] Adding URL: {url}') - add_stdout = StringIO() - with redirect_stdout(add_stdout): - extracted_links = add( - import_str=url, - update_all=False, - out_dir=OUTPUT_DIR, - ) - print(add_stdout.getvalue()) + if url: + print(f'[+] Adding URL: {url}') + add_stdout = StringIO() + with redirect_stdout(add_stdout): + extracted_links = add( + import_str=url, + update_all=False, + out_dir=OUTPUT_DIR, + ) + print(add_stdout.getvalue()) - context = { - "stdout": ansi_to_html(add_stdout.getvalue()) - } + context = { + "stdout": ansi_to_html(add_stdout.getvalue()) + } + else: + context = {"stdout": "Please enter a URL"} return render(template_name=self.template, request=request, context=context) diff --git a/archivebox/themes/default/add_links.html b/archivebox/themes/default/add_links.html index db09322f..6c625594 100644 --- a/archivebox/themes/default/add_links.html +++ b/archivebox/themes/default/add_links.html @@ -2,231 +2,211 @@ - - Archived Sites - - - - - - - - - -
-
- {% csrf_token %} Add new links...
+ tr td a.favicon img { + padding-left: 6px; + padding-right: 12px; + vertical-align: -4px; + } + tr td a.title { + font-size: 1.4em; + text-decoration:none; + color:black; + } + tr td a.title small { + background-color: #efefef; + border-radius: 4px; + float:right + } + input[type=search]::-webkit-search-cancel-button { + -webkit-appearance: searchfield-cancel-button; + } + .title-col { + text-align: left; + } + .title-col a { + color: black; + } + + + + + + + + +
+ {{ stdout | safe }} +

+
{% csrf_token %} + Add new links...
+
+ +
+
- Go back to Snapshot list -
- + Go back to Snapshot list + + From 8840ad72bbc2006c9e02690b814b6524679ef79f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 03:12:30 -0400 Subject: [PATCH 118/333] remove circular import possibilities --- archivebox/config/__init__.py | 8 ++++++++ archivebox/core/admin.py | 2 +- archivebox/util.py | 25 ++++++++++++++----------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index fa979211..f06b0f3d 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -21,6 +21,14 @@ from .stubs import ( ConfigDefaultDict, ) +# precedence order for config: +# 1. cli args +# 2. shell environment vars +# 3. config file +# 4. defaults + +# env USE_COLO=false archivebox add '...' +# env SHOW_PROGRESS=1 archivebox add '...' # ****************************************************************************** # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 5cf71796..7942c6c2 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,7 +1,7 @@ from django.contrib import admin from django.utils.html import format_html -from archivebox.util import htmldecode, urldecode +from util import htmldecode, urldecode from core.models import Snapshot from cli.logging import printable_filesize diff --git a/archivebox/util.py b/archivebox/util.py index 50511313..717e1185 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -14,15 +14,6 @@ from dateutil import parser as dateparser import requests from base32_crockford import encode as base32_encode # type: ignore -from .config import ( - TIMEOUT, - STATICFILE_EXTENSIONS, - CHECK_SSL_VALIDITY, - WGET_USER_AGENT, - CHROME_OPTIONS, - COLOR_DICT -) - try: import chardet detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"] @@ -49,7 +40,6 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links without_www = lambda url: url.replace('://www.', '://', 1) without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?') hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20] -is_static_file = lambda url: extension(url).lower() in STATICFILE_EXTENSIONS # TODO: the proper way is with MIME type detection, not using extension urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace') urldecode = lambda s: s and unquote(s) @@ -70,7 +60,14 @@ URL_REGEX = re.compile( re.IGNORECASE, ) +<<<<<<< HEAD COLOR_REGEX = re.compile(r'\[(?P\d+)(;(?P\d+)(;(?P\d+))?)?m') +======= +def is_static_file(url: str): + # TODO: the proper way is with MIME type detection + ext, not only extension + from .config import STATICFILE_EXTENSIONS + return extension(url).lower() in STATICFILE_EXTENSIONS +>>>>>>> c1fe068... 
remove circular import possibilities def enforce_types(func): @@ -155,8 +152,10 @@ def parse_date(date: Any) -> Optional[datetime]: @enforce_types -def download_url(url: str, timeout: int=TIMEOUT) -> str: +def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" + from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT + timeout = timeout or TIMEOUT response = requests.get( url, headers={'User-Agent': WGET_USER_AGENT}, @@ -170,6 +169,8 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str: def chrome_args(**options) -> List[str]: """helper to build up a chrome shell command with arguments""" + from .config import CHROME_OPTIONS + options = {**CHROME_OPTIONS, **options} cmd_args = [options['CHROME_BINARY']] @@ -202,6 +203,8 @@ def ansi_to_html(text): """ Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html """ + from .config import COLOR_DICT + TEMPLATE = '
' text = text.replace('[m', '
') From 2ece5c20cfb11eff27078faa316aa4af075e5ad9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 03:14:07 -0400 Subject: [PATCH 119/333] bump docs --- docs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs b/docs index d6d43042..2061184e 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit d6d43042893a017e0d43723da0b9890422102554 +Subproject commit 2061184e3ea6a35d8e32cb4ca6d24a1afc06706f From 3ec97e55283ed88be6ea3df89266378dda5fe09f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 03:22:37 -0400 Subject: [PATCH 120/333] fix git conflict commited by accident --- archivebox/util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/archivebox/util.py b/archivebox/util.py index 717e1185..4ba1e3dd 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -60,14 +60,12 @@ URL_REGEX = re.compile( re.IGNORECASE, ) -<<<<<<< HEAD COLOR_REGEX = re.compile(r'\[(?P\d+)(;(?P\d+)(;(?P\d+))?)?m') -======= + def is_static_file(url: str): # TODO: the proper way is with MIME type detection + ext, not only extension from .config import STATICFILE_EXTENSIONS return extension(url).lower() in STATICFILE_EXTENSIONS ->>>>>>> c1fe068... remove circular import possibilities def enforce_types(func): @@ -204,7 +202,7 @@ def ansi_to_html(text): Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html """ from .config import COLOR_DICT - + TEMPLATE = '
' text = text.replace('[m', '
') From 322be6b29233eee1b77626aab78d9e43b76261b0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 03:53:39 -0400 Subject: [PATCH 121/333] move main into cli init and remove circular import layer --- archivebox/__init__.py | 6 ---- archivebox/__main__.py | 9 ++---- archivebox/cli/__init__.py | 55 ++++++++++++++++++++++++++++++- archivebox/cli/archivebox.py | 63 ------------------------------------ setup.py | 11 +++---- 5 files changed, 61 insertions(+), 83 deletions(-) delete mode 100755 archivebox/cli/archivebox.py diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 56b6f16e..b0c00b61 100644 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,7 +1 @@ __package__ = 'archivebox' - -from . import core -from . import cli - -# The main CLI source code, is in 'archivebox/main.py' -from .main import * diff --git a/archivebox/__main__.py b/archivebox/__main__.py index 3386d46d..55e94415 100755 --- a/archivebox/__main__.py +++ b/archivebox/__main__.py @@ -3,13 +3,8 @@ __package__ = 'archivebox' import sys -from .cli import archivebox - - -def main(): - archivebox.main(args=sys.argv[1:], stdin=sys.stdin) +from .cli import main if __name__ == '__main__': - archivebox.main(args=sys.argv[1:], stdin=sys.stdin) - + main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 7972c02e..ece64f8b 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,8 +1,13 @@ __package__ = 'archivebox.cli' +__command__ = 'archivebox' import os +import argparse + +from typing import Optional, Dict, List, IO + +from ..config import OUTPUT_DIR -from typing import Dict, List, Optional, IO from importlib import import_module CLI_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -24,6 +29,7 @@ is_valid_cli_module = lambda module, subcommand: ( and module.__command__.split(' ')[-1] == subcommand ) + def list_subcommands() -> Dict[str, str]: """find and import all valid archivebox_.py files in CLI_DIR""" @@ -57,6 +63,53 @@ def run_subcommand(subcommand: str, SUBCOMMANDS = list_subcommands() + +def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: + subcommands = list_subcommands() + parser = argparse.ArgumentParser( + prog=__command__, + description='ArchiveBox: The self-hosted internet archive', + add_help=False, + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--help', '-h', + action='store_true', + help=subcommands['help'], + ) + group.add_argument( + '--version', + action='store_true', + help=subcommands['version'], + ) + group.add_argument( + "subcommand", + type=str, + help= "The name of the subcommand to run", + nargs='?', + choices=subcommands.keys(), + default=None, + ) + parser.add_argument( + "subcommand_args", + help="Arguments for the subcommand", + nargs=argparse.REMAINDER, + ) + command = parser.parse_args(args or ()) + + if command.help or command.subcommand is None: + command.subcommand = 'help' + if command.version: + command.subcommand = 'version' + + run_subcommand( + subcommand=command.subcommand, + subcommand_args=command.subcommand_args, + stdin=stdin, + pwd=pwd or OUTPUT_DIR, + ) + + __all__ = ( 'SUBCOMMANDS', 'list_subcommands', diff --git a/archivebox/cli/archivebox.py b/archivebox/cli/archivebox.py deleted file mode 100755 index c8281937..00000000 --- a/archivebox/cli/archivebox.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -# archivebox [command] - -__package__ = 'archivebox.cli' -__command__ 
= 'archivebox' - -import sys -import argparse - -from typing import Optional, List, IO - -from . import list_subcommands, run_subcommand -from ..config import OUTPUT_DIR - - -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - subcommands = list_subcommands() - parser = argparse.ArgumentParser( - prog=__command__, - description='ArchiveBox: The self-hosted internet archive', - add_help=False, - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--help', '-h', - action='store_true', - help=subcommands['help'], - ) - group.add_argument( - '--version', - action='store_true', - help=subcommands['version'], - ) - group.add_argument( - "subcommand", - type=str, - help= "The name of the subcommand to run", - nargs='?', - choices=subcommands.keys(), - default=None, - ) - parser.add_argument( - "subcommand_args", - help="Arguments for the subcommand", - nargs=argparse.REMAINDER, - ) - command = parser.parse_args(args or ()) - - if command.help or command.subcommand is None: - command.subcommand = 'help' - if command.version: - command.subcommand = 'version' - - run_subcommand( - subcommand=command.subcommand, - subcommand_args=command.subcommand_args, - stdin=stdin, - pwd=pwd or OUTPUT_DIR, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/setup.py b/setup.py index 8ac00c44..049528fb 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -import os import setuptools from pathlib import Path @@ -10,9 +9,9 @@ README = (BASE_DIR / "README.md").read_text() VERSION = (SOURCE_DIR / "VERSION").read_text().strip() # To see when setup.py gets called (uncomment for debugging) -import sys -print(SOURCE_DIR, f" (v{VERSION})") -print('>', sys.executable, *sys.argv) +# import sys +# print(SOURCE_DIR, f" (v{VERSION})") +# print('>', sys.executable, *sys.argv) # raise SystemExit(0) setuptools.setup( @@ -69,10 +68,10 @@ setuptools.setup( # 'redis': ['redis', 'django-redis'], # 'pywb': ['pywb', 'redis'], }, - packages=[PKG_NAME], + packages=setuptools.find_packages(), entry_points={ "console_scripts": [ - f"{PKG_NAME} = {PKG_NAME}.__main__:main", + f"{PKG_NAME} = {PKG_NAME}.cli:main", ], }, include_package_data=True, From 0c48449aa64c58fc350a40d39c3062e90e457a2d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 04:00:51 -0400 Subject: [PATCH 122/333] fix subcommand and args not being passed --- archivebox/cli/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index ece64f8b..8d06855a 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -2,6 +2,7 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox' import os +import sys import argparse from typing import Optional, Dict, List, IO @@ -65,6 +66,7 @@ SUBCOMMANDS = list_subcommands() def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: + args = sys.argv[1:] if args is None else args subcommands = list_subcommands() parser = argparse.ArgumentParser( prog=__command__, From 528fc8f1f64bae28e54b416be5bb578dc2e38ccb Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 2 Jul 2020 12:11:23 -0500 Subject: [PATCH 123/333] fix: Improve encoding detection for rss+xml content types --- archivebox/util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/archivebox/util.py b/archivebox/util.py index 4ba1e3dd..8fdda389 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -160,6 +160,15 @@ def 
download_url(url: str, timeout: int=None) -> str: verify=CHECK_SSL_VALIDITY, timeout=timeout, ) + if response.headers.get('Content-Type') == 'application/rss+xml': + # Based on https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py + _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' + _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P[\w-]+)') + _BODY_ENCODING_PATTERN = r'<\s*(\?xml\s[^>]+%s)' % (_XML_ENCODING_RE) + _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE) + match = _BODY_ENCODING_STR_RE.search(response.text[:1024]) + if match: + response.encoding = match.group('xmlcharset') return response.text From f373df7bd43ebe2c557f16c9e0c139975b63396c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 13:23:40 -0400 Subject: [PATCH 124/333] update helptext to clarify adding links --- archivebox/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/main.py b/archivebox/main.py index a1aba118..f1fb98ce 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -377,11 +377,11 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None: else: print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI)) print() - print(' {lightred}Hint:{reset}To view your archive index, open:'.format(**ANSI)) - print(' {}'.format(os.path.join(out_dir, HTML_INDEX_FILENAME))) + print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI)) + print(' archivebox server # then visit http://127.0.0.1:8000') print() print(' To add new links, you can run:') - print(" archivebox add 'https://example.com'") + print(" archivebox add ~/some/path/or/url/to/list_of_links.txt") print() print(' For more usage and examples, run:') print(' archivebox help') From 7c428f40c8b74df85c6088ad7fcd5b62c4e10556 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 2 Jul 2020 13:31:05 -0400 Subject: [PATCH 125/333] fix stdin link importing --- archivebox/cli/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 8d06855a..087f11b5 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -64,9 +64,14 @@ def run_subcommand(subcommand: str, SUBCOMMANDS = list_subcommands() +class NotProvided: + pass + + +def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None: + args = sys.argv[1:] if args is NotProvided else args + stdin = sys.stdin if stdin is NotProvided else stdin -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - args = sys.argv[1:] if args is None else args subcommands = list_subcommands() parser = argparse.ArgumentParser( prog=__command__, From 8bdfa18a3f8eb10dfd05337f7c488d20bda31bcc Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 2 Jul 2020 15:54:25 -0500 Subject: [PATCH 126/333] feat: Allow feed loading from the add links view --- archivebox/core/forms.py | 7 +++++ archivebox/core/views.py | 33 +++++++++++++++++------- archivebox/themes/default/add_links.html | 10 +++++-- 3 files changed, 38 insertions(+), 12 deletions(-) create mode 100644 archivebox/core/forms.py diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py new file mode 100644 index 00000000..5f67e2c6 --- /dev/null +++ b/archivebox/core/forms.py @@ -0,0 +1,7 @@ +from django import forms + +CHOICES = (('url', 'URL'), ('feed', 'Feed')) + +class AddLinkForm(forms.Form): + url 
= forms.URLField() + source = forms.ChoiceField(choices=CHOICES, widget=forms.RadioSelect, initial='url') diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 5efa79cd..0c5efff2 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -22,6 +22,8 @@ from ..config import ( from ..util import base_url, ansi_to_html from .. main import add +from .forms import AddLinkForm + class MainIndex(View): template = 'main_index.html' @@ -51,28 +53,39 @@ class AddLinks(View): if not request.user.is_authenticated and not PUBLIC_INDEX: return redirect(f'/admin/login/?next={request.path}') - context = {} + context = { + "form": AddLinkForm() + } return render(template_name=self.template, request=request, context=context) def post(self, request): - url = request.POST['url'] - if url: + #url = request.POST['url'] + #if url: + form = AddLinkForm(request.POST) + if form.is_valid(): + url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') + if form.cleaned_data["source"] == "url": + key = "import_str" + else: + key = "import_path" + input_kwargs = { + key: url, + "update_all": False, + "out_dir": OUTPUT_DIR, + } add_stdout = StringIO() with redirect_stdout(add_stdout): - extracted_links = add( - import_str=url, - update_all=False, - out_dir=OUTPUT_DIR, - ) + extracted_links = add(**input_kwargs) print(add_stdout.getvalue()) context = { - "stdout": ansi_to_html(add_stdout.getvalue()) + "stdout": ansi_to_html(add_stdout.getvalue()), + "form": AddLinkForm() } else: - context = {"stdout": "Please enter a URL"} + context = {"form": form} return render(template_name=self.template, request=request, context=context) diff --git a/archivebox/themes/default/add_links.html b/archivebox/themes/default/add_links.html index 6c625594..7143c576 100644 --- a/archivebox/themes/default/add_links.html +++ b/archivebox/themes/default/add_links.html @@ -159,6 +159,12 @@ .title-col a { color: black; } + .ul-form { + list-style: none; + } + .ul-form li { + list-style: none; + } @@ -199,9 +205,9 @@
{{ stdout | safe }}

-
{% csrf_token %} + {% csrf_token %} Add new links...
-
+ {{ form.as_ul }}
From 63fe19e2c2d236cabae36ef441aff9fd46dd6014 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 3 Jul 2020 11:52:57 -0500 Subject: [PATCH 127/333] feat: Add pytest and initial tests --- setup.py | 3 +++ tests/test_init.py | 40 ++++++++++++++++++++++++++++++++++++++++ tests/test_util.py | 21 +++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 tests/test_init.py create mode 100644 tests/test_util.py diff --git a/setup.py b/setup.py index 049528fb..12002580 100755 --- a/setup.py +++ b/setup.py @@ -65,6 +65,9 @@ setuptools.setup( "sphinx-rtd-theme", "recommonmark", ], + "test": [ + "pytest" + ] # 'redis': ['redis', 'django-redis'], # 'pywb': ['pywb', 'redis'], }, diff --git a/tests/test_init.py b/tests/test_init.py new file mode 100644 index 00000000..b870a599 --- /dev/null +++ b/tests/test_init.py @@ -0,0 +1,40 @@ +# archivebox init +# archivebox add + +import os +import subprocess +from pathlib import Path +import json + +import pytest + +@pytest.fixture +def process(tmp_path): + os.chdir(tmp_path) + process = subprocess.run(['archivebox', 'init'], capture_output=True) + return process + + +def test_init(tmp_path, process): + assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8") + +def test_update(tmp_path, process): + os.chdir(tmp_path) + update_process = subprocess.run(['archivebox', 'init'], capture_output=True) + assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8") + +def test_add_link(tmp_path, process): + os.chdir(tmp_path) + add_process = subprocess.run(['archivebox', 'add', 'http://example.com'], capture_output=True) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + + assert "index.json" in [x.name for x in archived_item_path.iterdir()] + + with open(archived_item_path / "index.json", "r") as f: + output_json = json.load(f) + assert "IANA — IANA-managed Reserved Domains" == output_json['history']['title'][0]['output'] + + with open(tmp_path / "index.html", "r") as f: + output_html = f.read() + assert "IANA — IANA-managed Reserved Domains" in output_html + diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 00000000..19ed31c0 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,21 @@ +#@enforce_types +#def download_url(url: str, timeout: int=None) -> str: +# """Download the contents of a remote url and return the text""" +# from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT +# timeout = timeout or TIMEOUT +# response = requests.get( +# url, +# headers={'User-Agent': WGET_USER_AGENT}, +# verify=CHECK_SSL_VALIDITY, +# timeout=timeout, +# ) +# if response.headers.get('Content-Type') == 'application/rss+xml': +# # Based on https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py +# _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' +# _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P[\w-]+)') +# _BODY_ENCODING_PATTERN = r'<\s*(\?xml\s[^>]+%s)' % (_XML_ENCODING_RE) +# _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE) +# match = _BODY_ENCODING_STR_RE.search(response.text[:1024]) +# if match: +# response.encoding = match.group('xmlcharset') +# return response.text \ No newline at end of file From 438203f4cec49e92c49976d57788be6b188f173e Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 3 Jul 2020 12:54:21 -0500 Subject: [PATCH 128/333] test: add basic download_url test --- tests/test_util.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git 
a/tests/test_util.py b/tests/test_util.py index 19ed31c0..1497de5a 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,21 +1,5 @@ -#@enforce_types -#def download_url(url: str, timeout: int=None) -> str: -# """Download the contents of a remote url and return the text""" -# from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT -# timeout = timeout or TIMEOUT -# response = requests.get( -# url, -# headers={'User-Agent': WGET_USER_AGENT}, -# verify=CHECK_SSL_VALIDITY, -# timeout=timeout, -# ) -# if response.headers.get('Content-Type') == 'application/rss+xml': -# # Based on https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py -# _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' -# _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P[\w-]+)') -# _BODY_ENCODING_PATTERN = r'<\s*(\?xml\s[^>]+%s)' % (_XML_ENCODING_RE) -# _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE) -# match = _BODY_ENCODING_STR_RE.search(response.text[:1024]) -# if match: -# response.encoding = match.group('xmlcharset') -# return response.text \ No newline at end of file +from archivebox import util + +def test_download_url_downloads_content(): + text = util.download_url("https://example.com") + assert "Example Domain" in text \ No newline at end of file From 4302ae4caa4fccbe40e67084d4b3edd315e9eb1f Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 3 Jul 2020 13:13:59 -0500 Subject: [PATCH 129/333] fix: Remove test section in setup.py --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 12002580..9ca39608 100755 --- a/setup.py +++ b/setup.py @@ -64,10 +64,8 @@ setuptools.setup( "sphinx", "sphinx-rtd-theme", "recommonmark", + "pytest", ], - "test": [ - "pytest" - ] # 'redis': ['redis', 'django-redis'], # 'pywb': ['pywb', 'redis'], }, From ffaae510779b49b44450c58c3c631a29f065ae32 Mon Sep 17 00:00:00 2001 From: apkallum Date: Fri, 3 Jul 2020 16:52:28 -0400 Subject: [PATCH 130/333] test github actions --- .github/workflows/test.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..311236c0 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,25 @@ +name: Test workflow +on: [push] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + architecture: x64 + + - name: Install dependencies + run: | + pip install -e .[dev] + + - name: Test with pytest + run: | + pytest -s \ No newline at end of file From d5fc13b34e0f29c67b52c05a3ba098f049830e60 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 08:36:58 -0500 Subject: [PATCH 131/333] refactor: Move pytest fixtures to its own file --- tests/__init__.py | 0 tests/fixtures.py | 10 ++++++++++ tests/test_args.py | 0 tests/test_init.py | 9 +-------- 4 files changed, 11 insertions(+), 8 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/fixtures.py create mode 100644 tests/test_args.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 00000000..9bf2640a --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,10 @@ +import os +import subprocess + +import pytest + +@pytest.fixture +def process(tmp_path): + os.chdir(tmp_path) + 
process = subprocess.run(['archivebox', 'init'], capture_output=True) + return process \ No newline at end of file diff --git a/tests/test_args.py b/tests/test_args.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_init.py b/tests/test_init.py index b870a599..1b80bb1b 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -6,14 +6,7 @@ import subprocess from pathlib import Path import json -import pytest - -@pytest.fixture -def process(tmp_path): - os.chdir(tmp_path) - process = subprocess.run(['archivebox', 'init'], capture_output=True) - return process - +from .fixtures import * def test_init(tmp_path, process): assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8") From 8b22a2a7dd2507e164f0780fa38d73ba36912144 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 09:10:36 -0500 Subject: [PATCH 132/333] feat: Enable --depth flag (still does nothing) --- archivebox/cli/archivebox_add.py | 13 +++++++------ tests/test_args.py | 7 +++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 272fe5cf..77a11bd0 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -45,6 +45,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ' ~/Desktop/sites_list.csv\n' ) ) + parser.add_argument( + "--depth", + action="store", + default=0, + type=int, + help="Recursively archive all linked pages up to this many hops away" + ) command = parser.parse_args(args or ()) import_str = accept_stdin(stdin) add( @@ -63,12 +70,6 @@ if __name__ == '__main__': # TODO: Implement these # # parser.add_argument( -# '--depth', #'-d', -# type=int, -# help='Recursively archive all linked pages up to this many hops away', -# default=0, -# ) -# parser.add_argument( # '--mirror', #'-m', # action='store_true', # help='Archive an entire site (finding all linked pages below it on the same domain)', diff --git a/tests/test_args.py b/tests/test_args.py index e69de29b..b8df1941 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -0,0 +1,7 @@ +import subprocess + +from .fixtures import * + +def test_depth_flag_is_accepted(tmp_path, process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) + assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') \ No newline at end of file From 2db03245398f0a6c7fcda77a3ebc5688e3836396 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 09:49:28 -0500 Subject: [PATCH 133/333] feat: depth=0 crawls the current page only --- archivebox/cli/archivebox_add.py | 14 +++++++++++--- tests/test_args.py | 12 ++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 77a11bd0..5bbccb19 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -53,14 +53,22 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - import_str = accept_stdin(stdin) + #import_str = accept_stdin(stdin) add( - import_str=import_str, - import_path=command.import_path, + import_str=command.import_path, + import_path=None, update_all=command.update_all, index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, ) + #if command.depth == 1: + # add( + # 
import_str=None, + # import_path=command.import_path, + # update_all=command.update_all, + # index_only=command.index_only, + # out_dir=pwd or OUTPUT_DIR, + # ) if __name__ == '__main__': diff --git a/tests/test_args.py b/tests/test_args.py index b8df1941..59d43fee 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -1,7 +1,15 @@ import subprocess +import json from .fixtures import * -def test_depth_flag_is_accepted(tmp_path, process): +def test_depth_flag_is_accepted(process): arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) - assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') \ No newline at end of file + assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') + +def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + with open(archived_item_path / "index.json", "r") as f: + output_json = json.load(f) + assert output_json["base_url"] == "example.com" \ No newline at end of file From 32e790979e2f37c3615b52e0ed858603abd429a5 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 10:07:44 -0500 Subject: [PATCH 134/333] feat: Enable depth=1 functionality --- archivebox/cli/archivebox_add.py | 16 ++++++++-------- tests/test_args.py | 9 ++++++++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 5bbccb19..65335679 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -61,14 +61,14 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, ) - #if command.depth == 1: - # add( - # import_str=None, - # import_path=command.import_path, - # update_all=command.update_all, - # index_only=command.index_only, - # out_dir=pwd or OUTPUT_DIR, - # ) + if command.depth == 1: + add( + import_str=None, + import_path=command.import_path, + update_all=command.update_all, + index_only=command.index_only, + out_dir=pwd or OUTPUT_DIR, + ) if __name__ == '__main__': diff --git a/tests/test_args.py b/tests/test_args.py index 59d43fee..e0c6020e 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -12,4 +12,11 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert output_json["base_url"] == "example.com" \ No newline at end of file + assert output_json["base_url"] == "example.com" + +def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=1"], capture_output=True) + with open(tmp_path / "index.json", "r") as f: + archive_file = f.read() + assert "https://example.com" in archive_file + assert "https://www.iana.org/domains/example" in archive_file \ No newline at end of file From a6940092bbf37123e68e2c22418584fa9b4a2d88 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 10:25:02 -0500 Subject: [PATCH 135/333] feat: Make sure that depth can only be either 1 or 0 --- archivebox/cli/archivebox_add.py | 2 +- tests/test_args.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 65335679..2f77f754 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -49,11 +49,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional "--depth", action="store", default=0, + choices=[0,1], type=int, help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - #import_str = accept_stdin(stdin) add( import_str=command.import_path, import_path=None, diff --git a/tests/test_args.py b/tests/test_args.py index e0c6020e..91264ef2 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -5,7 +5,13 @@ from .fixtures import * def test_depth_flag_is_accepted(process): arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) - assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode('utf-8') + assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") + +def test_depth_flag_fails_if_it_is_not_0_or_1(process): + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=5"], capture_output=True) + assert 'invalid choice' in arg_process.stderr.decode("utf-8") + arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=-1"], capture_output=True) + assert 'invalid choice' in arg_process.stderr.decode("utf-8") def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) @@ -19,4 +25,4 @@ def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): with open(tmp_path / "index.json", "r") as f: archive_file = f.read() assert "https://example.com" in archive_file - assert "https://www.iana.org/domains/example" in archive_file \ No newline at end of file + assert "https://www.iana.org/domains/example" in archive_file From bca6a06f6035e7a10c9726ef40e7aed4b4b7ee34 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 11:53:02 -0500 Subject: [PATCH 136/333] test: Fix test to reflect new API changes --- tests/test_init.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_init.py b/tests/test_init.py index 1b80bb1b..c5627a2f 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -25,9 +25,9 @@ def test_add_link(tmp_path, process): with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert "IANA — IANA-managed Reserved Domains" == output_json['history']['title'][0]['output'] + assert "Example Domain" == output_json['history']['title'][0]['output'] with open(tmp_path / "index.html", "r") as f: output_html = f.read() - assert "IANA — IANA-managed Reserved Domains" in output_html + assert "Example Domain" in output_html From b68c13918f28246e8521080a03486dcbb7ff8537 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 12:39:36 -0500 Subject: [PATCH 137/333] feat: Disable stdin from archivebox add --- archivebox/cli/archivebox_add.py | 6 ++++-- archivebox/main.py | 3 +-- tests/test_init.py | 6 ++++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 2f77f754..c729e9fb 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -10,7 +10,7 @@ from typing import List, Optional, IO from ..main import add, docstring from ..config import OUTPUT_DIR, ONLY_NEW -from 
.logging import SmartFormatter, accept_stdin +from .logging import SmartFormatter, reject_stdin @docstring(add.__doc__) @@ -38,9 +38,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional type=str, default=None, help=( - 'URL or path to local file containing a list of links to import. e.g.:\n' + 'URL or path to local file containing a page or list of links to import. e.g.:\n' ' https://getpocket.com/users/USERNAME/feed/all\n' ' https://example.com/some/rss/feed.xml\n' + ' https://example.com\n' ' ~/Downloads/firefox_bookmarks_export.html\n' ' ~/Desktop/sites_list.csv\n' ) @@ -54,6 +55,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) + reject_stdin(__command__, stdin) add( import_str=command.import_path, import_path=None, diff --git a/archivebox/main.py b/archivebox/main.py index f1fb98ce..3f05a385 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -507,8 +507,7 @@ def add(import_str: Optional[str]=None, if (import_str and import_path) or (not import_str and not import_path): stderr( - '[X] You should pass either an import path as an argument, ' - 'or pass a list of links via stdin, but not both.\n', + '[X] You should pass an import path or a page url as an argument\n', color='red', ) raise SystemExit(2) diff --git a/tests/test_init.py b/tests/test_init.py index c5627a2f..d592b0a1 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -31,3 +31,9 @@ def test_add_link(tmp_path, process): output_html = f.read() assert "Example Domain" in output_html +def test_add_link_does_not_support_stdin(tmp_path, process): + os.chdir(tmp_path) + stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = stdin_process.communicate(input="example.com".encode())[0] + assert "does not accept stdin" in output.decode("utf-8") + From c1d8a74e4f2673047e31b96aa303fbd300dccc50 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 7 Jul 2020 15:46:45 -0500 Subject: [PATCH 138/333] feat: Make input sent via stdin behave the same as using args --- archivebox/cli/archivebox_add.py | 19 +++++++++++++++---- tests/test_init.py | 12 +++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index c729e9fb..c692750b 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -10,7 +10,7 @@ from typing import List, Optional, IO from ..main import add, docstring from ..config import OUTPUT_DIR, ONLY_NEW -from .logging import SmartFormatter, reject_stdin +from .logging import SmartFormatter, accept_stdin @docstring(add.__doc__) @@ -55,9 +55,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) + import_string = accept_stdin(stdin) + if import_string and command.import_path: + stderr( + '[X] You should pass an import path or a page url as an argument or in stdin but not both\n', + color='red', + ) + raise SystemExit(2) + elif import_string: + import_path = import_string + else: + import_path = command.import_path + add( - import_str=command.import_path, + import_str=import_path, import_path=None, update_all=command.update_all, index_only=command.index_only, @@ -66,7 +77,7 
@@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional if command.depth == 1: add( import_str=None, - import_path=command.import_path, + import_path=import_path, update_all=command.update_all, index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, diff --git a/tests/test_init.py b/tests/test_init.py index d592b0a1..97870459 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -31,9 +31,15 @@ def test_add_link(tmp_path, process): output_html = f.read() assert "Example Domain" in output_html -def test_add_link_does_not_support_stdin(tmp_path, process): +def test_add_link_support_stdin(tmp_path, process): os.chdir(tmp_path) stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - output = stdin_process.communicate(input="example.com".encode())[0] - assert "does not accept stdin" in output.decode("utf-8") + stdin_process.communicate(input="http://example.com".encode()) + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + + assert "index.json" in [x.name for x in archived_item_path.iterdir()] + + with open(archived_item_path / "index.json", "r") as f: + output_json = json.load(f) + assert "Example Domain" == output_json['history']['title'][0]['output'] From f12bfeb3229345b2d4cd7c1670ba050ca1111e7c Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 8 Jul 2020 08:17:47 -0500 Subject: [PATCH 139/333] refactor: Change add() to receive url and depth instead of import_str and import_path --- archivebox/cli/archivebox_add.py | 12 ++---------- archivebox/core/views.py | 8 +++----- archivebox/main.py | 25 ++++++++++--------------- 3 files changed, 15 insertions(+), 30 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index c692750b..8f491d42 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -68,20 +68,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional import_path = command.import_path add( - import_str=import_path, - import_path=None, + url=import_path, + depth=command.depth, update_all=command.update_all, index_only=command.index_only, out_dir=pwd or OUTPUT_DIR, ) - if command.depth == 1: - add( - import_str=None, - import_path=import_path, - update_all=command.update_all, - index_only=command.index_only, - out_dir=pwd or OUTPUT_DIR, - ) if __name__ == '__main__': diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 0c5efff2..a721b992 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -66,12 +66,10 @@ class AddLinks(View): if form.is_valid(): url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') - if form.cleaned_data["source"] == "url": - key = "import_str" - else: - key = "import_path" + depth = 0 if form.cleaned_data["source"] == "url" else 1 input_kwargs = { - key: url, + "url": url, + "depth": depth, "update_all": False, "out_dir": OUTPUT_DIR, } diff --git a/archivebox/main.py b/archivebox/main.py index 3f05a385..a96c4250 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -496,8 +496,8 @@ def status(out_dir: str=OUTPUT_DIR) -> None: @enforce_types -def add(import_str: Optional[str]=None, - import_path: Optional[str]=None, +def add(url: str, + depth: int=0, update_all: bool=not ONLY_NEW, index_only: bool=False, out_dir: str=OUTPUT_DIR) -> List[Link]: @@ -505,17 +505,9 @@ def add(import_str: Optional[str]=None, check_data_folder(out_dir=out_dir) - if (import_str and import_path) or (not import_str and not 
import_path): - stderr( - '[X] You should pass an import path or a page url as an argument\n', - color='red', - ) - raise SystemExit(2) - elif import_str: - import_path = save_stdin_to_sources(import_str, out_dir=out_dir) - elif import_path: - import_path = save_file_to_sources(import_path, out_dir=out_dir) - + base_path = save_stdin_to_sources(url, out_dir=out_dir) + if depth == 1: + depth_path = save_file_to_sources(url, out_dir=out_dir) check_dependencies() # Step 1: Load list of links from the existing index @@ -523,8 +515,11 @@ def add(import_str: Optional[str]=None, all_links: List[Link] = [] new_links: List[Link] = [] all_links = load_main_index(out_dir=out_dir) - if import_path: - all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir) + all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir) + if depth == 1: + all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir) + new_links = new_links + new_links_depth + # Step 2: Write updated index with deduped old and new links back to disk write_main_index(links=all_links, out_dir=out_dir) From 4ebf929606b50afcce94f2440a7ac363cc96a887 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 8 Jul 2020 08:30:07 -0500 Subject: [PATCH 140/333] refactor: Change wording on CLI help --- archivebox/cli/archivebox_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 8f491d42..c4c78399 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -38,7 +38,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional type=str, default=None, help=( - 'URL or path to local file containing a page or list of links to import. e.g.:\n' + 'URL or path to local file to start the archiving process from. e.g.:\n' ' https://getpocket.com/users/USERNAME/feed/all\n' ' https://example.com/some/rss/feed.xml\n' ' https://example.com\n' From d476b130074a18e0a903743bdd3e61b5f7f397b0 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 8 Jul 2020 14:46:31 -0500 Subject: [PATCH 141/333] fix: Add missing permission to add view (post) --- archivebox/core/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 0c5efff2..57941264 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -60,8 +60,8 @@ class AddLinks(View): return render(template_name=self.template, request=request, context=context) def post(self, request): - #url = request.POST['url'] - #if url: + if not request.user.is_authenticated and not PUBLIC_INDEX: + return redirect(f'/admin/login/?next={request.path}') form = AddLinkForm(request.POST) if form.is_valid(): url = form.cleaned_data["url"] From 09b4438c9f5ad89c9cc46bdc3c4df131420a8b37 Mon Sep 17 00:00:00 2001 From: Apkallum Date: Wed, 8 Jul 2020 17:54:01 -0400 Subject: [PATCH 142/333] fix legacy index.html --- archivebox/themes/legacy/main_index.html | 73 +----------------------- 1 file changed, 2 insertions(+), 71 deletions(-) diff --git a/archivebox/themes/legacy/main_index.html b/archivebox/themes/legacy/main_index.html index 1b366300..e246b0d9 100644 --- a/archivebox/themes/legacy/main_index.html +++ b/archivebox/themes/legacy/main_index.html @@ -4,34 +4,6 @@ Archived Sites + + + +
+

Example Domain

+

This domain is for use in illustrative examples in documents. You may use this + domain in literature without prior coordination or asking for permission.

+

+ More information... +

+
+ + diff --git a/tests/mock_server/templates/iana.org.html b/tests/mock_server/templates/iana.org.html new file mode 100644 index 00000000..c1e60a2e --- /dev/null +++ b/tests/mock_server/templates/iana.org.html @@ -0,0 +1,390 @@ + + + + IANA — IANA-managed Reserved Domains + + + + + + + + + + + + + + + + + +
+ +
+ +
+ + +
+ + +

IANA-managed Reserved Domains

+ +

Certain domains are set aside, and nominally registered to “IANA”, for specific + policy or technical purposes.

+ +

Example domains

+ +

As described in + RFC 2606 + and + RFC 6761, + a number of domains such as + example.com + and + example.org + are maintained for documentation purposes. These domains may be used as illustrative + examples in documents without prior coordination with us. They are + not available for registration or transfer.

+ +

Test IDN top-level domains

+ +

These domains were temporarily delegated by IANA for the + IDN Evaluation + being conducted by + ICANN.

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DomainDomain (A-label)LanguageScript
إختبار + + XN--KGBECHTV + + ArabicArabic
آزمایشی + + XN--HGBK6AJ7F53BBA + + PersianArabic
测试 + + XN--0ZWM56D + + ChineseHan (Simplified variant)
測試 + + XN--G6W251D + + ChineseHan (Traditional variant)
испытание + + XN--80AKHBYKNJ4F + + RussianCyrillic
परीक्षा + + XN--11B5BS3A9AJ6G + + HindiDevanagari (Nagari)
δοκιμή + + XN--JXALPDLP + + Greek, Modern (1453-)Greek
테스트 + + XN--9T4B11YI5A + + KoreanHangul (Hangŭl, Hangeul)
טעסט + + XN--DEBA0AD + + YiddishHebrew
テスト + + XN--ZCKZAH + + JapaneseKatakana
பரிட்சை + + XN--HLCJ6AYA9ESC7A + + TamilTamil
+
+ +

Policy-reserved domains

+ +

We act as both the registrant and registrar for a select number of domains + which have been reserved under policy grounds. These exclusions are + typically indicated in either technical standards (RFC documents), + or + contractual limitations.

+ +

Domains which are described as registered to IANA or ICANN on policy + grounds are not available for registration or transfer, with the exception + of + + country-name.info + domains. These domains are available for release + by the ICANN Governmental Advisory Committee Secretariat.

+ +

Other Special-Use Domains

+ +

There is additionally a + Special-Use Domain Names + registry documenting special-use domains designated by technical standards. For further information, see + Special-Use Domain Names + (RFC 6761).

+ + +
+ + + + +
+ + diff --git a/tests/test_args.py b/tests/test_args.py index 91264ef2..f52626fb 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -4,25 +4,25 @@ import json from .fixtures import * def test_depth_flag_is_accepted(process): - arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=0"], capture_output=True) assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") def test_depth_flag_fails_if_it_is_not_0_or_1(process): - arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=5"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=5"], capture_output=True) assert 'invalid choice' in arg_process.stderr.decode("utf-8") - arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=-1"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=-1"], capture_output=True) assert 'invalid choice' in arg_process.stderr.decode("utf-8") def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=0"], capture_output=True) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert output_json["base_url"] == "example.com" + assert output_json["base_url"] == "localhost:8080/static/example.com.html" def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=1"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=1"], capture_output=True) with open(tmp_path / "index.json", "r") as f: archive_file = f.read() - assert "https://example.com" in archive_file - assert "https://www.iana.org/domains/example" in archive_file + assert "http://localhost:8080/static/example.com.html" in archive_file + assert "http://localhost:8080/static/iana.org.html" in archive_file From fe80a93a0380a11a3196f194c13bf9ae13531e4e Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 13 Jul 2020 09:43:36 -0500 Subject: [PATCH 146/333] test: Refactor init tests to use local webserver --- tests/test_init.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_init.py b/tests/test_init.py index 97870459..24d3ed52 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,7 +18,7 @@ def test_update(tmp_path, process): def test_add_link(tmp_path, process): os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://example.com'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/example.com.html'], capture_output=True) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] assert "index.json" in [x.name for x in archived_item_path.iterdir()] @@ -34,7 +34,7 @@ def test_add_link(tmp_path, process): def test_add_link_support_stdin(tmp_path, process): os.chdir(tmp_path) stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - stdin_process.communicate(input="http://example.com".encode()) + stdin_process.communicate(input="http://localhost:8080/static/example.com.html".encode()) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] assert "index.json" in [x.name for x in archived_item_path.iterdir()] From 322997e229457bf43ee2281993ccdc30c8455244 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 13 Jul 2020 09:44:50 -0500 Subject: [PATCH 147/333] test: Refactor util tests to use local webserver --- tests/test_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_util.py b/tests/test_util.py index 1497de5a..0a076344 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,5 +1,5 @@ from archivebox import util def test_download_url_downloads_content(): - text = util.download_url("https://example.com") + text = util.download_url("http://localhost:8080/static/example.com.html") assert "Example Domain" in text \ No newline at end of file From 7cbd068c95e5a40851a40e9ed272b62c49a885e9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:22:07 -0400 Subject: [PATCH 148/333] add flake8 --- .flake8 | 6 ++++++ archivebox/.flake8 | 8 +++++--- archivebox/__main__.py | 1 + archivebox/config/__init__.py | 4 +++- archivebox/core/models.py | 1 - archivebox/index/schema.py | 1 + archivebox/main.py | 4 ++-- 7 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..01af646d --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391 +select = F,E9,W +max-line-length = 130 +max-complexity = 10 +exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv diff --git a/archivebox/.flake8 b/archivebox/.flake8 index 46da144b..dd6ba8e4 100644 --- a/archivebox/.flake8 +++ b/archivebox/.flake8 @@ -1,4 +1,6 @@ [flake8] -ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E127,E131,E241,E252,E266,E272,E701,E731,W293,W503 -select = F,E9 -exclude = migrations,util_scripts,node_modules,venv +ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391 +select = F,E9,W +max-line-length = 130 +max-complexity = 10 +exclude = migrations,tests,node_modules,vendor,static,venv,.venv,.venv2,.docker-venv diff --git a/archivebox/__main__.py b/archivebox/__main__.py index 55e94415..8afaa27a 100755 --- a/archivebox/__main__.py +++ b/archivebox/__main__.py @@ -6,5 +6,6 @@ import sys from .cli import main + if __name__ == '__main__': main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index f06b0f3d..14b66e92 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -279,6 +279,8 @@ def load_config_val(key: str, config: Optional[ConfigDict]=None, env_vars: Optional[os._Environ]=None, config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue: + """parse bool, int, and str key=value pairs from env""" + config_keys_to_check = (key, *(aliases or ())) for key in config_keys_to_check: @@ -777,7 +779,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr() stderr(f'[!] Warning: TIMEOUT is set too low! 
(currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.') - stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)') + stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)') stderr() stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles') diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 2cbfc1b1..42929e5a 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -24,7 +24,6 @@ class Snapshot(models.Model): keys = ('url', 'timestamp', 'title', 'tags', 'updated') - def __repr__(self) -> str: title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 637e0589..db17c269 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -98,6 +98,7 @@ class Link: updated: Optional[datetime] = None schema: str = 'Link' + def __str__(self) -> str: return f'[{self.timestamp}] {self.base_url} "{self.title}"' diff --git a/archivebox/main.py b/archivebox/main.py index a96c4250..a6e04dd3 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -641,8 +641,8 @@ def update(resume: Optional[float]=None, out_dir: str=OUTPUT_DIR) -> List[Link]: """Import any new links from subscriptions and retry any previously failed/skipped links""" - check_dependencies() check_data_folder(out_dir=out_dir) + check_dependencies() # Step 1: Load list of links from the existing index # merge in and dedupe new links from import_path @@ -990,7 +990,7 @@ def schedule(add: bool=False, if total_runs > 60 and not quiet: stderr() stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI)) - stderr(f' Congrats on being an enthusiastic internet archiver! 👌') + stderr(' Congrats on being an enthusiastic internet archiver! 
👌') stderr() stderr(' Make sure you have enough storage space available to hold all the data.') stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') From 96b1e4a8ec1eb64c979c185b912ef6d60b25074f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:22:58 -0400 Subject: [PATCH 149/333] accept local paths as valid link URLs when parsing --- archivebox/parsers/generic_txt.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index cc3653a0..61d1973f 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -5,6 +5,7 @@ import re from typing import IO, Iterable from datetime import datetime +from pathlib import Path from ..index.schema import Link from ..util import ( @@ -13,14 +14,28 @@ from ..util import ( URL_REGEX ) + @enforce_types def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]: """Parse raw links from each line in a text file""" text_file.seek(0) for line in text_file.readlines(): - urls = re.findall(URL_REGEX, line) if line.strip() else () - for url in urls: # type: ignore + if not line.strip(): + continue + + # if the line is a local file path that resolves, then we can archive it + if Path(line).exists(): + yield Link( + url=line, + timestamp=str(datetime.now().timestamp()), + title=None, + tags=None, + sources=[text_file.name], + ) + + # otherwise look for anything that looks like a URL in the line + for url in re.findall(URL_REGEX, line): yield Link( url=htmldecode(url), timestamp=str(datetime.now().timestamp()), From 16f3746712e3767ea3ab1ef0aec3cc38108b331b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:24:36 -0400 Subject: [PATCH 150/333] check source dir at the end of checking data dir --- archivebox/config/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 14b66e92..3638bade 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -838,6 +838,10 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> stderr(' archivebox init') raise SystemExit(3) + sources_dir = os.path.join(output_dir, SOURCES_DIR_NAME) + if not os.path.exists(sources_dir): + os.makedirs(sources_dir) + def setup_django(out_dir: str=None, check_db=False, config: ConfigDict=CONFIG) -> None: From dfb83b4f2728f2f0a389650836d6164a2f80e809 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:24:49 -0400 Subject: [PATCH 151/333] add AttributeDict --- archivebox/util.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/archivebox/util.py b/archivebox/util.py index 8fdda389..0e7ebd31 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -230,6 +230,23 @@ def ansi_to_html(text): return COLOR_REGEX.sub(single_sub, text) +class AttributeDict(dict): + """Helper to allow accessing dict values via Example.key or Example['key']""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Recursively convert nested dicts to AttributeDicts (optional): + # for key, val in self.items(): + # if isinstance(val, dict) and type(val) is not AttributeDict: + # self[key] = AttributeDict(val) + + def __getattr__(self, attr: str) -> Any: + return dict.__getitem__(self, attr) + + def __setattr__(self, attr: str, value: Any) -> None: + return dict.__setitem__(self, attr, value) + + class 
ExtendedEncoder(pyjson.JSONEncoder): """ Extended json serializer that supports serializing several model From 354a63ccd4f021c68747c8a16d30cd54f67167b8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:25:43 -0400 Subject: [PATCH 152/333] dont dedupe snapshots in sqlite on every run --- archivebox/index/sql.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 0ad68de0..80203980 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -26,23 +26,8 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: from core.models import Snapshot from django.db import transaction - all_urls = {link.url: link for link in links} - all_ts = {link.timestamp: link for link in links} - with transaction.atomic(): - for snapshot in Snapshot.objects.all(): - if snapshot.timestamp in all_ts: - info = {k: v for k, v in all_urls.pop(snapshot.url)._asdict().items() if k in Snapshot.keys} - snapshot.delete() - Snapshot.objects.create(**info) - elif snapshot.url in all_urls: - info = {k: v for k, v in all_urls.pop(snapshot.url)._asdict().items() if k in Snapshot.keys} - snapshot.delete() - Snapshot.objects.create(**info) - else: - snapshot.delete() - - for url, link in all_urls.items(): + for link in links: info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} Snapshot.objects.update_or_create(url=url, defaults=info) From d3bfa98a912fe4a360835b1e32258244ffa12262 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:26:30 -0400 Subject: [PATCH 153/333] fix depth flag and tweak logging --- archivebox/cli/__init__.py | 12 +++- archivebox/cli/archivebox_add.py | 24 +++---- archivebox/cli/logging.py | 61 ++++++++++++------ archivebox/extractors/__init__.py | 27 +++++++- archivebox/index/__init__.py | 29 +++++---- archivebox/main.py | 102 ++++++++++++------------------ archivebox/parsers/__init__.py | 28 ++------ 7 files changed, 156 insertions(+), 127 deletions(-) diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 087f11b5..b7575c4a 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -106,8 +106,18 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, if command.help or command.subcommand is None: command.subcommand = 'help' - if command.version: + elif command.version: command.subcommand = 'version' + + if command.subcommand not in ('help', 'version', 'status'): + from ..cli.logging import log_cli_command + + log_cli_command( + subcommand=command.subcommand, + subcommand_args=command.subcommand_args, + stdin=stdin, + pwd=pwd or OUTPUT_DIR + ) run_subcommand( subcommand=command.subcommand, diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index c4c78399..55832346 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -10,7 +10,7 @@ from typing import List, Optional, IO from ..main import add, docstring from ..config import OUTPUT_DIR, ONLY_NEW -from .logging import SmartFormatter, accept_stdin +from .logging import SmartFormatter, accept_stdin, stderr @docstring(add.__doc__) @@ -33,12 +33,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Add the links to the main index without archiving them", ) parser.add_argument( - 'import_path', - nargs='?', + 'urls', + nargs='*', type=str, default=None, help=( - 'URL or path to local file to start the archiving process 
from. e.g.:\n' + 'URLs or paths to archive e.g.:\n' ' https://getpocket.com/users/USERNAME/feed/all\n' ' https://example.com/some/rss/feed.xml\n' ' https://example.com\n' @@ -50,25 +50,21 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional "--depth", action="store", default=0, - choices=[0,1], + choices=[0, 1], type=int, help="Recursively archive all linked pages up to this many hops away" ) command = parser.parse_args(args or ()) - import_string = accept_stdin(stdin) - if import_string and command.import_path: + urls = command.urls + stdin_urls = accept_stdin(stdin) + if (stdin_urls and urls) or (not stdin and not urls): stderr( - '[X] You should pass an import path or a page url as an argument or in stdin but not both\n', + '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', color='red', ) raise SystemExit(2) - elif import_string: - import_path = import_string - else: - import_path = command.import_path - add( - url=import_path, + urls=stdin_urls or urls, depth=command.depth, update_all=command.update_all, index_only=command.index_only, diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py index 6de78d8f..a12c4e98 100644 --- a/archivebox/cli/logging.py +++ b/archivebox/cli/logging.py @@ -5,10 +5,12 @@ import os import sys import time import argparse +import logging +import signal +from multiprocessing import Process from datetime import datetime from dataclasses import dataclass -from multiprocessing import Process from typing import Optional, List, Dict, Union, IO from ..index.schema import Link, ArchiveResult @@ -23,11 +25,11 @@ from ..config import ( SHOW_PROGRESS, TERM_WIDTH, OUTPUT_DIR, + SOURCES_DIR_NAME, HTML_INDEX_FILENAME, stderr, ) - @dataclass class RuntimeStats: """mutable stats counter for logging archiving timing info to CLI output""" @@ -98,9 +100,9 @@ class TimedProgress: if SHOW_PROGRESS: # terminate if we havent already terminated - if self.p is not None: - self.p.terminate() - self.p = None + self.p.terminate() + self.p.join() + self.p.close() # clear whole terminal line try: @@ -145,28 +147,51 @@ def progress_bar(seconds: int, prefix: str='') -> None: seconds, )) sys.stdout.flush() - except KeyboardInterrupt: + except (KeyboardInterrupt, BrokenPipeError): print() pass +def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str): + from ..config import VERSION, ANSI + cmd = ' '.join(('archivebox', subcommand, *subcommand_args)) + stdin_hint = ' < /dev/stdin' if not stdin.isatty() else '' + print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format( + now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + VERSION=VERSION, + cmd=cmd, + stdin_hint=stdin_hint, + **ANSI, + )) + print('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI)) + print() + ### Parsing Stage -def log_parsing_started(source_file: str): - start_ts = datetime.now() - _LAST_RUN_STATS.parse_start_ts = start_ts - print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - source_file.rsplit('/', 1)[-1], + +def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool): + _LAST_RUN_STATS.parse_start_ts = datetime.now() + print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format( + _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'), + len(urls) if isinstance(urls, list) else len(urls.split('\n')), + depth, + ' (index only)' if index_only else '', **ANSI, )) +def 
log_source_saved(source_file: str): + print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) -def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str): - end_ts = datetime.now() - _LAST_RUN_STATS.parse_end_ts = end_ts - print(' > Parsed {} links as {} ({} new links added)'.format(num_parsed, parser_name, num_new_links)) +def log_parsing_finished(num_parsed: int, parser_name: str): + _LAST_RUN_STATS.parse_end_ts = datetime.now() + print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name)) +def log_deduping_finished(num_new_links: int): + print(' > Found {} new URLs not already in index'.format(num_new_links)) + + +def log_crawl_started(new_links): + print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI)) ### Indexing Stage @@ -174,7 +199,7 @@ def log_indexing_process_started(num_links: int): start_ts = datetime.now() _LAST_RUN_STATS.index_start_ts = start_ts print() - print('{green}[*] [{}] Writing {} links to main index...{reset}'.format( + print('{black}[*] [{}] Writing {} links to main index...{reset}'.format( start_ts.strftime('%Y-%m-%d %H:%M:%S'), num_links, **ANSI, @@ -209,7 +234,7 @@ def log_archiving_started(num_links: int, resume: Optional[float]=None): **ANSI, )) else: - print('{green}[▶] [{}] Updating content for {} matching pages in archive...{reset}'.format( + print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format( start_ts.strftime('%Y-%m-%d %H:%M:%S'), num_links, **ANSI, diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index c6a4f33c..c08e7c0c 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -2,7 +2,7 @@ __package__ = 'archivebox.extractors' import os -from typing import Optional +from typing import Optional, List from datetime import datetime from ..index.schema import Link @@ -13,6 +13,9 @@ from ..index import ( ) from ..util import enforce_types from ..cli.logging import ( + log_archiving_started, + log_archiving_paused, + log_archiving_finished, log_link_archiving_started, log_link_archiving_finished, log_archive_method_started, @@ -103,3 +106,25 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None) raise return link + + +@enforce_types +def archive_links(links: List[Link], out_dir: Optional[str]=None) -> List[Link]: + if not links: + return [] + + log_archiving_started(len(links)) + idx: int = 0 + link: Link = links[0] + try: + for idx, link in enumerate(links): + archive_link(link, out_dir=link.link_dir) + except KeyboardInterrupt: + log_archiving_paused(len(links), idx, link.timestamp) + raise SystemExit(0) + except BaseException: + print() + raise + + log_archiving_finished(len(links)) + return links diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index e82cfefa..7ea473d7 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -33,8 +33,8 @@ from ..cli.logging import ( log_indexing_process_finished, log_indexing_started, log_indexing_finished, - log_parsing_started, log_parsing_finished, + log_deduping_finished, ) from .schema import Link, ArchiveResult @@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]: return None + @enforce_types -def import_new_links(existing_links: List[Link], - import_path: str, - out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]: +def parse_links_from_source(source_path: 
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index e82cfefa..7ea473d7 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -33,8 +33,8 @@ from ..cli.logging import (
     log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
-    log_parsing_started,
     log_parsing_finished,
+    log_deduping_finished,
 )
 from .schema import Link, ArchiveResult
@@ -268,20 +268,31 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
     return None
 
+
 @enforce_types
-def import_new_links(existing_links: List[Link],
-                     import_path: str,
-                     out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     new_links: List[Link] = []
 
     # parse and validate the import file
-    log_parsing_started(import_path)
-    raw_links, parser_name = parse_links(import_path)
+    raw_links, parser_name = parse_links(source_path)
     new_links = validate_links(raw_links)
+    if parser_name:
+        num_parsed = len(raw_links)
+        log_parsing_finished(num_parsed, parser_name)
+
+    return new_links
+
+
+@enforce_types
+def dedupe_links(existing_links: List[Link],
+                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+
+    from ..parsers import parse_links
+
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -290,11 +301,7 @@ def import_new_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
-
-    if parser_name:
-        num_parsed = len(raw_links)
-        num_new_links = len(all_links) - len(existing_links)
-        log_parsing_finished(num_parsed, num_new_links, parser_name)
+    log_deduping_finished(len(new_links))
 
     return all_links, new_links
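import_new_links() is split above into parse_links_from_source() and dedupe_links(), so parsing and deduplication can be invoked separately (for example once per crawled page when depth=1). A simplified sketch of how they compose, using the names from the diff (hypothetical caller code):

    existing_links = load_main_index(out_dir=out_dir)
    new_links = parse_links_from_source(source_path)                 # parse + validate only
    all_links, new_links = dedupe_links(existing_links, new_links)   # drop URLs already in the index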
diff --git a/archivebox/main.py b/archivebox/main.py
index a6e04dd3..54b71acc 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -4,8 +4,7 @@ import os
 import sys
 import shutil
 
-from typing import Dict, List, Optional, Iterable, IO
-
+from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
 
 from .cli import (
@@ -17,16 +16,17 @@ from .cli import (
     archive_cmds,
 )
 from .parsers import (
-    save_stdin_to_sources,
-    save_file_to_sources,
+    save_text_as_source,
+    save_file_as_source,
 )
 from .index.schema import Link
-from .util import enforce_types, docstring
+from .util import enforce_types, docstring  # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     links_after_timestamp,
     load_main_index,
-    import_new_links,
+    parse_links_from_source,
+    dedupe_links,
     write_main_index,
     link_matches_filter,
     get_indexed_folders,
@@ -51,7 +51,7 @@ from .index.sql import (
     apply_migrations,
 )
 from .index.html import parse_html_main_index
-from .extractors import archive_link
+from .extractors import archive_links
 from .config import (
     stderr,
     ConfigDict,
@@ -91,9 +91,8 @@ from .config import (
 from .cli.logging import (
     TERM_WIDTH,
     TimedProgress,
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
+    log_importing_started,
+    log_crawl_started,
     log_removal_started,
     log_removal_finished,
     log_list_started,
@@ -496,59 +495,55 @@ def status(out_dir: str=OUTPUT_DIR) -> None:
 
 
 @enforce_types
-def add(url: str,
+def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
+    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
+
+    # Load list of links from the existing index
     check_data_folder(out_dir=out_dir)
-
-    base_path = save_stdin_to_sources(url, out_dir=out_dir)
-    if depth == 1:
-        depth_path = save_file_to_sources(url, out_dir=out_dir)
     check_dependencies()
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
     all_links: List[Link] = []
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
-    all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
-    if depth == 1:
-        all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
-        new_links = new_links + new_links_depth
+
+    log_importing_started(urls=urls, depth=depth, index_only=index_only)
+    if isinstance(urls, str):
+        # save verbatim stdin to sources
+        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
+    elif isinstance(urls, list):
+        # save verbatim args to sources
+        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
+
+    new_links += parse_links_from_source(write_ahead_log)
+    all_links, new_links = dedupe_links(all_links, new_links)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_main_index(links=all_links, out_dir=out_dir)
+    # If we're going one level deeper, download each link and look for more links
+    if new_links and depth == 1:
+        log_crawl_started(new_links)
+        for new_link in new_links:
+            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
+            new_links += parse_links_from_source(downloaded_file)
+            all_links, new_links = dedupe_links(all_links, new_links)
+            write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
-
-    # Step 3: Run the archive methods for each link
-    links = all_links if update_all else new_links
-    log_archiving_started(len(links))
-    idx: int = 0
-    link: Link = None  # type: ignore
-    try:
-        for idx, link in enumerate(links):
-            archive_link(link, out_dir=link.link_dir)
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
+    # Run the archive methods for each link
+    to_archive = all_links if update_all else new_links
+    archive_links(to_archive, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = load_main_index(out_dir=out_dir)
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    if to_archive:
+        all_links = load_main_index(out_dir=out_dir)
+        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
 
 
 @enforce_types
@@ -671,23 +666,8 @@ def update(resume: Optional[float]=None,
         return all_links
 
     # Step 3: Run the archive methods for each link
-    links = new_links if only_new else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Link = None  # type: ignore
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
+    to_archive = new_links if only_new else all_links
+    archive_links(to_archive, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)
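After this change add() accepts either a newline-separated string of URLs or a list of URLs, plus an optional one-hop crawl depth. Hypothetical calls showing the new signature (not taken from the patch):

    from archivebox.main import add

    add('https://example.com')                                    # single URL, depth=0
    add(['https://example.com', 'https://example.org'], depth=1)  # also crawl pages one hop out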
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 479d4e2c..eabaece2 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -29,7 +29,7 @@ from ..util import (
     URL_REGEX,
 )
 from ..index.schema import Link
-from ..cli.logging import pretty_path, TimedProgress
+from ..cli.logging import pretty_path, TimedProgress, log_source_saved
 from .pocket_html import parse_pocket_html_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -83,36 +83,22 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
 
 
 @enforce_types
-def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
+def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts))
     atomic_write(source_path, raw_text)
+    log_source_saved(source_file=source_path)
     return source_path
 
 
 @enforce_types
-def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
+def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
-    check_data_folder(out_dir=out_dir)
-
-    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
-
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
+    source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))
 
     if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         # Source is a URL that needs to be downloaded
-        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
         print('{}[*] [{}] Downloading {}{}'.format(
             ANSI['green'],
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -140,7 +126,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
 
     atomic_write(source_path, raw_source_text)
 
-    print(' > {}'.format(pretty_path(source_path)))
+    log_source_saved(source_file=source_path)
 
     return source_path
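The renamed helpers above now take a filename template, so each file written to output/sources/ records what produced it. A sketch of how the templates expand, with made-up timestamp and URL values:

    save_text_as_source(text, filename='{ts}-import.txt')
    # -> output/sources/1594650000-import.txt

    save_file_as_source('https://example.com/feed.xml', filename='{ts}-crawl-{basename}.txt')
    # -> output/sources/1594650000-crawl-feed.xml.txt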
From 4c4b1e6a4bde5edb9e11942245a21437e73fe6df Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 13 Jul 2020 11:33:35 -0400
Subject: [PATCH 154/333] fix link creation

---
 archivebox/index/sql.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 80203980..b120738c 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -29,7 +29,7 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     with transaction.atomic():
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-            Snapshot.objects.update_or_create(url=url, defaults=info)
+            Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 @enforce_types
 def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:

From d159e674e1fb7005f1732f78adbd5cf5aa49436a Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 13 Jul 2020 11:41:18 -0400
Subject: [PATCH 155/333] write stderr instead of stdout for version info

---
 archivebox/cli/logging.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py
index a12c4e98..d11ffd9e 100644
--- a/archivebox/cli/logging.py
+++ b/archivebox/cli/logging.py
@@ -156,15 +156,15 @@ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional
     from ..config import VERSION, ANSI
     cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
     stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
-    print('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
+    stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
         VERSION=VERSION,
         cmd=cmd,
         stdin_hint=stdin_hint,
         **ANSI,
     ))
-    print('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
-    print()
+    stderr('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
+    stderr()
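Routing the version banner to stderr keeps stdout reserved for the command's actual output, so piping archivebox into another program is not polluted by log chatter. A minimal sketch of the pattern, assuming the stderr() helper used above is roughly equivalent to printing to sys.stderr:

    import sys

    def stderr(*args):
        # human-facing status lines go to stderr; machine-readable results stay on stdout
        print(*args, file=sys.stderr)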
From b4ce20cbe5b3d41676a43a337e0e12a869e53aac Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 13 Jul 2020 11:41:27 -0400
Subject: [PATCH 156/333] write link details json before and after archiving

---
 archivebox/extractors/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index c08e7c0c..c9685a80 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -56,6 +56,7 @@ def archive_link(link: Link, overwrite: bool=False, out_dir: Optional[str]=None)
         os.makedirs(out_dir)
 
     link = load_link_details(link, out_dir=out_dir)
+    write_link_details(link, out_dir=link.link_dir)
     log_link_archiving_started(link, out_dir, is_new)
     link = link.overwrite(updated=datetime.now())
     stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

From 215d5eae324d9da3ffb758bf5e47f7b31d942e9a Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 13 Jul 2020 11:41:37 -0400
Subject: [PATCH 157/333] normal git clone instead of mirror

---
 archivebox/extractors/git.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py
index 1534ce34..dcb1df3c 100644
--- a/archivebox/extractors/git.py
+++ b/archivebox/extractors/git.py
@@ -56,7 +56,7 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
     cmd = [
         GIT_BINARY,
         'clone',
-        '--mirror',
+        # '--mirror',
         '--recursive',
         *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),

From ae208435c9c979720fad8f7782d6c74247b6c069 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 13 Jul 2020 12:21:37 -0400
Subject: [PATCH 158/333] fix the add links form

---
 archivebox/cli/logging.py                | 2 +-
 archivebox/core/admin.py                 | 2 +-
 archivebox/core/forms.py                 | 7 +++++--
 archivebox/core/views.py                 | 4 ++--
 archivebox/extractors/git.py             | 1 -
 archivebox/themes/default/add_links.html | 2 +-
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py
index d11ffd9e..f002e922 100644
--- a/archivebox/cli/logging.py
+++ b/archivebox/cli/logging.py
@@ -191,7 +191,7 @@ def log_deduping_finished(num_new_links: int):
 
 
 def log_crawl_started(new_links):
-    print('{lightblue}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
+    print('{lightred}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
 
 
 ### Indexing Stage

diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 7942c6c2..1b05c580 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -49,7 +49,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             '📼 ' '📦 ' '🏛 ' '' '{}',
             obj.archive_path, canon['wget_path'] or '',
             obj.archive_path, canon['pdf_path'],

diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index 5f67e2c6..8bf0cbd0 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -1,7 +1,10 @@
 from django import forms
 
-CHOICES = (('url', 'URL'), ('feed', 'Feed'))
+CHOICES = (
+    ('0', 'depth=0 (archive just this url)'),
+    ('1', 'depth=1 (archive this url and all sites one link away)'),
+)
 
 class AddLinkForm(forms.Form):
     url = forms.URLField()
-    source = forms.ChoiceField(choices=CHOICES, widget=forms.RadioSelect, initial='url')
+    depth = forms.ChoiceField(choices=CHOICES, widget=forms.RadioSelect, initial='0')

diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index d9c51700..5fb43119 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -66,9 +66,9 @@ class AddLinks(View):
         if form.is_valid():
             url = form.cleaned_data["url"]
             print(f'[+] Adding URL: {url}')
-            depth = 0 if form.cleaned_data["source"] == "url" else 1
+            depth = 0 if form.cleaned_data["depth"] == "0" else 1
             input_kwargs = {
-                "url": url,
+                "urls": url,
                 "depth": depth,
                 "update_all": False,
                 "out_dir": OUTPUT_DIR,

diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py
index dcb1df3c..c8a5eeaf 100644
--- a/archivebox/extractors/git.py
+++ b/archivebox/extractors/git.py
@@ -56,7 +56,6 @@ def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> A
     cmd = [
         GIT_BINARY,
         'clone',
-        # '--mirror',
         '--recursive',
         *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),

diff --git a/archivebox/themes/default/add_links.html b/archivebox/themes/default/add_links.html
index 7143c576..6e35f38c 100644
--- a/archivebox/themes/default/add_links.html
+++ b/archivebox/themes/default/add_links.html
@@ -212,7 +212,7 @@
-            Go back to Snapshot list
+            Go back to Main Index

From a79dd4685a2bea2f6d9b94a79215d28eb72ba722 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Mon, 13 Jul 2020 12:21:52 -0400
Subject: [PATCH 159/333] make snapshots unique again

---
 .../migrations/0004_auto_20200713_1552.py | 19 +++++++++++++++++++
 archivebox/core/models.py                 |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 archivebox/core/migrations/0004_auto_20200713_1552.py

diff --git a/archivebox/core/migrations/0004_auto_20200713_1552.py b/archivebox/core/migrations/0004_auto_20200713_1552.py
new file mode 100644
index 00000000..69836623
--- /dev/null
+++ b/archivebox/core/migrations/0004_auto_20200713_1552.py
@@ -0,0 +1,19 @@
+# Generated by Django 3.0.7 on 2020-07-13 15:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0003_auto_20200630_1034'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='timestamp',
+            field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
+            preserve_default=False,
+        ),
+    ]

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 42929e5a..7ac9427b 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -13,7 +13,7 @@ class Snapshot(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
 
     url = models.URLField(unique=True)
-    timestamp = models.CharField(max_length=32, null=True, default=None, db_index=True)
+    timestamp = models.CharField(max_length=32, unique=True, db_index=True)
     title = models.CharField(max_length=128, null=True, default=None, db_index=True)
     tags = models.CharField(max_length=256, null=True, default=None, db_index=True)
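Because the migration above adds unique=True to an existing column, it will fail on a database that already contains duplicate Snapshot timestamps. A hypothetical pre-flight check (not part of the patch) for spotting collisions before migrating:

    from django.db.models import Count
    from core.models import Snapshot

    # timestamps that appear more than once would violate the new unique constraint
    dupes = Snapshot.objects.values('timestamp').annotate(n=Count('id')).filter(n__gt=1)
    for row in dupes:
        print(row['timestamp'], row['n'])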
From 5e2bf73f047f2a647f1497a98aedc4cf76f12832 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Mon, 13 Jul 2020 14:48:25 -0500
Subject: [PATCH 160/333] fix: Bugs related to add() refactor

---
 archivebox/index/__init__.py |  6 +++++-
 archivebox/main.py           | 10 ++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 7ea473d7..cd50a185 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -292,7 +292,6 @@ def dedupe_links(existing_links: List[Link],
                  new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
-
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -301,6 +300,11 @@ def dedupe_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
+
+    all_links_deduped = {link.url: link for link in all_links}
+    for i in range(len(new_links)):
+        if new_links[i].url in all_links_deduped.keys():
+            new_links[i] = all_links_deduped[new_links[i].url]
     log_deduping_finished(len(new_links))
 
     return all_links, new_links

diff --git a/archivebox/main.py b/archivebox/main.py
index 54b71acc..999e4650 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -520,18 +520,16 @@ def add(urls: Union[str, List[str]],
         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
 
     new_links += parse_links_from_source(write_ahead_log)
-    all_links, new_links = dedupe_links(all_links, new_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
-
     # If we're going one level deeper, download each link and look for more links
+    new_links_depth = []
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
-            new_links += parse_links_from_source(downloaded_file)
-            all_links, new_links = dedupe_links(all_links, new_links)
-            write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+            new_links_depth += parse_links_from_source(downloaded_file)
 
+    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
     if index_only:
         return all_links

From 98dda688970c8993a7a79847ea74ff5e30964b4f Mon Sep 17 00:00:00 2001
From: apkallum
Date: Tue, 14 Jul 2020 10:26:33 -0400
Subject: [PATCH 161/333] fix: timestamp comparison in to_json function

---
 archivebox/index/schema.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index db17c269..eb6ef894 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -190,7 +190,10 @@ class Link:
             for key, val in json_info.items()
             if key in cls.field_names()
         }
-        info['updated'] = parse_date(info.get('updated'))
+        try:
+            info['updated'] = int(parse_date(info.get('updated')))  # Cast to int which comes with rounding down
+        except (ValueError, TypeError):
+            info['updated'] = None
         info['sources'] = info.get('sources') or []
 
         json_history = info.get('history') or {}
From f845224d6f60e59ee53981885c400eb83a03fb12 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Thu, 16 Jul 2020 09:20:33 -0500
Subject: [PATCH 162/333] fix: htmlencode titles before rendering the static html index and detail

---
 archivebox/index/html.py                |   4 +-
 .../templates/title_with_html.com.html  | 699 ++++++++++++++++++
 tests/test_title.py                     |  14 +
 3 files changed, 715 insertions(+), 2 deletions(-)
 create mode 100644 tests/mock_server/templates/title_with_html.com.html
 create mode 100644 tests/test_title.py

diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 60d41049..e21ae576 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -90,7 +90,7 @@ def main_index_row_template(link: Link) -> str:
         **link._asdict(extended=True),
 
         # before pages are finished archiving, show loading msg instead of title
-        'title': (
+        'title': htmlencode(
             link.title
             or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
         ),
@@ -129,7 +129,7 @@ def link_details_template(link: Link) -> str:
     return render_legacy_template(LINK_DETAILS_TEMPLATE, {
         **link_info,
         **link_info['canonical'],
-        'title': (
+        'title': htmlencode(
             link.title
             or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
         ),
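Escaping the title before it is interpolated into the static index and detail templates keeps markup from a page's title tag from being rendered (or executed) in the archive UI. A rough illustration, assuming htmlencode() behaves like Python's html.escape:

    from html import escape as htmlencode

    title = 'It All Starts with a Humble <textarea>'
    print(htmlencode(title))
    # -> It All Starts with a Humble &lt;textarea&gt;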
diff --git a/tests/mock_server/templates/title_with_html.com.html b/tests/mock_server/templates/title_with_html.com.html
new file mode 100644
index 00000000..e84dcaa0
--- /dev/null
+++ b/tests/mock_server/templates/title_with_html.com.html
@@ -0,0 +1,699 @@
+ [699-line HTML fixture: a saved copy of the 24 ways article "It All Starts with a Humble <textarea>" — page navigation, article body, inline service-worker JavaScript, comments, and footer — served by the mock server as a page whose title contains HTML markup; full markup omitted here]
diff --git a/tests/test_title.py b/tests/test_title.py
new file mode 100644
index 00000000..b5090844
--- /dev/null
+++ b/tests/test_title.py
@@ -0,0 +1,14 @@
+from .fixtures import *
+
+def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+    """
+    https://github.com/pirate/ArchiveBox/issues/330
+    Unencoded content should not be rendered as it facilitates xss injections
+    and breaks the layout.
+    """
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+
+    with open(tmp_path / "index.html", "r") as f:
+        output_html = f.read()
+
+    assert "