Sufficient update

This commit is contained in:
Minecon724 2025-01-17 15:16:51 +01:00
parent 3852e1d425
commit 3b7ecfaa69
Signed by: Minecon724
GPG key ID: 3CCC4D267742C8E8
5 changed files with 49 additions and 16 deletions

1
.gitignore vendored
View file

@@ -1,2 +1,3 @@
venv/ venv/
cache/ cache/
__pycache__/

12
Dockerfile Normal file
View file

@@ -0,0 +1,12 @@
# Minimal image for the wikipedia-proxy Quart app, served by Hypercorn.
FROM python:3.13-alpine

# Install dependencies first so this layer is cached across source-only changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY src ./src

EXPOSE 8000

# Alpine/BusyBox has no `useradd`; use `adduser -D` (system-style, no password).
# Creating the user after COPY keeps app files root-owned (read-only to `app`).
RUN adduser -D app
USER app

# Hypercorn's CLI takes `--bind HOST:PORT`, not `--host`; without this the
# server would stay on the default 127.0.0.1:8000 and EXPOSE 8000 would be moot.
CMD ["hypercorn", "src.__main__:app", "--bind", "0.0.0.0:8000"]

View file

@@ -1,7 +1,18 @@
Change `base_url` and run it as you would run a quart app Change `BASE_URL` environment variable to your URL!
This is a MITM between you and wikipedia, basically it impersonates wikipedia. Production:
I don't know if that's legal, but I don't think you're going to get sued. ```bash
BASE_URL=wikipedia.localhost:8000 hypercorn src.__main__:app
```
It's easier to maintain this way. If somebody turns out to have a problem, I will do something about it Debug:
```bash
BASE_URL=wikipedia.localhost:8000 python3 src/
```
## Privacy
Nothing is intentionally logged, however, it's possible to tell what pages were accessed by the instance and when, due to cache. And leaking happens.

View file

@@ -1,2 +1,2 @@
quart
aiohttp[speedups] aiohttp[speedups]
quart

View file

@@ -3,17 +3,22 @@ import aiohttp, aiofiles
import aiofiles.os import aiofiles.os
from time import time from time import time
from binascii import crc32 from binascii import crc32
from os import getenv
import re import re
app = Quart(__name__, static_folder=None) app = Quart(__name__, static_folder=None)
base_url = 'wikipedia.localhost:5000' base_proto = 'http'
base_url = 'wikipedia.localhost:8000'
def fix_url(text): print("Base URL is " + base_proto + "://" + base_url)
def fix_url(text: str) -> str:
pattern = r'((https:|http:)?//([\w-]+\.)+)(wikipedia|wikimedia)\.org' pattern = r'((https:|http:)?//([\w-]+\.)+)(wikipedia|wikimedia)\.org'
def repl(match): def repl(match):
protocol = match.group(2) protocol = match.group(2)
if protocol: if protocol:
return 'http://' + match.group(3) + base_url return base_proto + '://' + match.group(3) + base_url
else: else:
return '//' + match.group(3) + base_url return '//' + match.group(3) + base_url
return re.sub(pattern, repl, text) return re.sub(pattern, repl, text)
@@ -22,7 +27,10 @@ def fix_url(text):
@app.route('/<path:path>') @app.route('/<path:path>')
async def proxy(path): async def proxy(path):
host = request.headers.get('Host') host = request.headers.get('Host')
dep = host.split('.')[0]
dep = host[:-len(base_url)]
if dep != '':
dep = dep[:-1] # remove dot
if not host.endswith(base_url): if not host.endswith(base_url):
return '', 200 return '', 200
@@ -83,19 +91,21 @@ async def proxy(path):
headers = {k.lower(): v for k,v in headers if k.lower() in ['cache-control', 'age', 'content-language', 'content-type', 'last-modified', 'date', 'x-content-type-options', 'location']} headers = {k.lower(): v for k,v in headers if k.lower() in ['cache-control', 'age', 'content-language', 'content-type', 'last-modified', 'date', 'x-content-type-options', 'location']}
content = await response.content.read() content = await response.content.read()
if 'location' in headers: if 'location' in headers: # if a redirect
print(headers['location'])
print(request.url_root)
headers['location'] = fix_url(headers['location']) headers['location'] = fix_url(headers['location'])
if any([headers['content-type'].startswith(t) for t in ['text/html', 'text/javascript', 'application']]): try:
content = fix_url(content.decode()).encode() content = fix_url(content.decode()).encode()
except: # if file is not a text file
pass
headers['content-length'] = str(len(content)) headers['content-length'] = str(len(content))
headers['x-rp-cache'] = 'MISS'
await aiofiles.os.makedirs(cache_path, exist_ok=True) await aiofiles.os.makedirs(cache_path, exist_ok=True)
async with aiofiles.open(cache_path + '/_file', mode='wb') as f: async with aiofiles.open(cache_path + '/_file', mode='wb') as f:
await f.write(int.to_bytes(int(time()-int(headers['age'])), 8)) await f.write(int.to_bytes(int(time() - int(headers['age'])), 8))
await f.write(int.to_bytes(response.status, 2)) await f.write(int.to_bytes(response.status, 2))
await f.write(int.to_bytes(len(headers))) await f.write(int.to_bytes(len(headers)))
for k, v in headers.items(): for k, v in headers.items():
@@ -107,11 +117,10 @@ async def proxy(path):
await f.write(int.to_bytes(len(value_encoded))) await f.write(int.to_bytes(len(value_encoded)))
await f.write(value_encoded) await f.write(value_encoded)
await f.write(content) await f.write(content)
print('cache written')
r = Response(content, status=response.status, headers=headers) r = Response(content, status=response.status, headers=headers)
return r return r
if __name__ == "__main__": if __name__ == "__main__":
app.debug = True app.debug = True
app.run() app.run(port=8000)