From 3b7ecfaa696028007d379c7dcfb7d1c346ebf455 Mon Sep 17 00:00:00 2001 From: Minecon724 Date: Fri, 17 Jan 2025 15:16:51 +0100 Subject: [PATCH] Sufficient update --- .gitignore | 1 + Dockerfile | 12 ++++++++++++ README.md | 19 +++++++++++++++---- requirements.txt | 2 +- main.py => src/__main__.py | 31 ++++++++++++++++++++----------- 5 files changed, 49 insertions(+), 16 deletions(-) create mode 100644 Dockerfile rename main.py => src/__main__.py (86%) diff --git a/.gitignore b/.gitignore index 16a179c..6dbc493 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ venv/ cache/ +__pycache__/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..73d7c26 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.13-alpine + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY src ./src +EXPOSE 8000 + +RUN useradd app +USER app + +CMD ["hypercorn", "src.__main__:app", "--host", "0.0.0.0"] \ No newline at end of file diff --git a/README.md b/README.md index 3b6f039..21f6ee4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,18 @@ -Change `base_url` and run it as you would run a quart app +Change `BASE_URL` environment variable to your URL! -This is a MITM between you and wikipedia, basically it impersonates wikipedia. +Production: -I don't know if that's legal, but I don't think you're going to get sued. +```bash +BASE_URL=wikipedia.localhost:8000 hypercorn src.__main__:app +``` -It's easier to maintain this way. If somebody turns out to have a problem, I will do something about it +Debug: + +```bash +BASE_URL=wikipedia.localhost:8000 python3 src/ +``` + + +## Privacy + +Nothing is intentionally logged, however, it's possible to tell what pages were accessed by the instance and when, due to cache. And leaking happens. diff --git a/requirements.txt b/requirements.txt index 666a289..d24addd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -quart aiohttp[speedups] +quart \ No newline at end of file diff --git a/main.py b/src/__main__.py similarity index 86% rename from main.py rename to src/__main__.py index dfb4967..c5e3314 100644 --- a/main.py +++ b/src/__main__.py @@ -3,17 +3,22 @@ import aiohttp, aiofiles import aiofiles.os from time import time from binascii import crc32 +from os import getenv import re app = Quart(__name__, static_folder=None) -base_url = 'wikipedia.localhost:5000' +base_proto = 'http' +base_url = 'wikipedia.localhost:8000' -def fix_url(text): +print("Base URL is " + base_proto + "://" + base_url) + +def fix_url(text: str) -> str: pattern = r'((https:|http:)?//([\w-]+\.)+)(wikipedia|wikimedia)\.org' + def repl(match): protocol = match.group(2) if protocol: - return 'http://' + match.group(3) + base_url + return base_proto + '://' + match.group(3) + base_url else: return '//' + match.group(3) + base_url return re.sub(pattern, repl, text) @@ -22,7 +27,10 @@ def fix_url(text): @app.route('/') async def proxy(path): host = request.headers.get('Host') - dep = host.split('.')[0] + + dep = host[:-len(base_url)] + if dep != '': + dep = dep[:-1] # remove dot if not host.endswith(base_url): return '', 200 @@ -83,19 +91,21 @@ async def proxy(path): headers = {k.lower(): v for k,v in headers if k.lower() in ['cache-control', 'age', 'content-language', 'content-type', 'last-modified', 'date', 'x-content-type-options', 'location']} content = await response.content.read() - if 'location' in headers: - print(headers['location']) - print(request.url_root) + if 'location' in headers: # if a redirect headers['location'] = fix_url(headers['location']) - if any([headers['content-type'].startswith(t) for t in ['text/html', 'text/javascript', 'application']]): + try: content = fix_url(content.decode()).encode() + except: # if file is not a text file + pass headers['content-length'] = str(len(content)) + headers['x-rp-cache'] = 'MISS' await aiofiles.os.makedirs(cache_path, exist_ok=True) + async with aiofiles.open(cache_path + '/_file', mode='wb') as f: - await f.write(int.to_bytes(int(time()-int(headers['age'])), 8)) + await f.write(int.to_bytes(int(time() - int(headers['age'])), 8)) await f.write(int.to_bytes(response.status, 2)) await f.write(int.to_bytes(len(headers))) for k, v in headers.items(): @@ -107,11 +117,10 @@ async def proxy(path): await f.write(int.to_bytes(len(value_encoded))) await f.write(value_encoded) await f.write(content) - print('cache written') r = Response(content, status=response.status, headers=headers) return r if __name__ == "__main__": app.debug = True - app.run() + app.run(port=8000)