wikipedia-mirror/main.py

from quart import Quart, Response, request
import aiohttp
import aiofiles
import aiofiles.os
from time import time
from binascii import crc32
import re

app = Quart(__name__, static_folder=None)
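
# Hostname this mirror answers on; project subdomains of it (en., de., ...)
# are mapped back to the matching wikipedia.org/wikimedia.org hosts below.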
base_url = 'wikipedia.localhost:5000'
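

# Rewrite absolute and protocol-relative wikipedia.org/wikimedia.org URLs in
# a block of text so links keep pointing at this mirror.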
def fix_url(text):
    pattern = r'((https:|http:)?//([\w-]+\.)+)(wikipedia|wikimedia)\.org'

    def repl(match):
        protocol = match.group(2)
        if protocol:
            return 'http://' + match.group(3) + base_url
        else:
            return '//' + match.group(3) + base_url

    return re.sub(pattern, repl, text)
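

# Reverse proxy: map the request's Host subdomain to the corresponding
# upstream Wikipedia/Wikimedia host and serve everything through a disk cache.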
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
async def proxy(path):
    host = request.headers.get('Host', '')
    dep = host.split('.')[0]
    if not host.endswith(base_url):
        return '', 200
    elif host == base_url:
        url = 'https://wikipedia.org'
        dep = '_main'
    elif dep in ['intake-analytics']:
        # Swallow analytics beacons instead of forwarding them.
        return '', 200
    elif dep in ['upload', 'meta', 'login', 'maps', 'foundation', 'developer']:
        url = f'https://{dep}.wikimedia.org'
    else:
        url = f'https://{dep}.wikipedia.org'
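
    # Cache key: one directory per project and path; query strings are folded
    # into the upstream path and into the key via a CRC32 suffix.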
    cache_path = 'cache/' + dep + '/' + path
    if len(request.args) > 0:
        path += '?' + '&'.join([k + '=' + v for k, v in request.args.items()])
        cache_path += '_v' + str(crc32(path.encode()))
    if any(i in ['..', '.'] for i in path.split('/')):
        return 'Forbidden', 403
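
    # Serve from the on-disk cache when an entry exists and is still fresh.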
    cached = await aiofiles.os.path.exists(cache_path + '/_file')
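    # Entry layout: 8-byte fetch timestamp, 2-byte HTTP status, 1-byte header
    # count, length-prefixed key/value pairs, then the response body.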
    if cached:
        async with aiofiles.open(cache_path + '/_file', mode='rb') as f:
            timestamp = int.from_bytes(await f.read(8), 'big')
            status = int.from_bytes(await f.read(2), 'big')
            headers = {}
            for _ in range(int.from_bytes(await f.read(1), 'big')):
                key_length = int.from_bytes(await f.read(1), 'big')
                key = (await f.read(key_length)).decode()
                value_length = int.from_bytes(await f.read(1), 'big')
                value = (await f.read(value_length)).decode()
                headers[key] = value
            age = int(time() - timestamp)
            headers['age'] = str(age)
            headers['x-rp-cache'] = 'HIT'
            cache_max_age = 0
            if 'cache-control' in headers:
                directives = [i.strip() for i in headers['cache-control'].split(',')]
                max_age = [i for i in directives if i.startswith('max-age=')]
                if max_age:
                    cache_max_age = int(max_age[0][len('max-age='):])
            if age < cache_max_age:
                content = await f.read(int(headers['content-length']))
                return Response(content, status=status, headers=headers)
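
    # Cache miss or stale entry: fetch from upstream, rewrite, and re-cache.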
    async with aiohttp.ClientSession() as session:
        async with session.get(url + '/' + path, allow_redirects=False) as response:
            # Keep only a small allow-list of upstream headers.
            headers = {k.lower(): v for k, v in response.headers.items()
                       if k.lower() in ['cache-control', 'age', 'content-language',
                                        'content-type', 'last-modified', 'date',
                                        'x-content-type-options', 'location']}
            content = await response.content.read()
            if 'location' in headers:
                # Rewrite redirects so they stay on the mirror.
                headers['location'] = fix_url(headers['location'])
            if any(headers.get('content-type', '').startswith(t)
                   for t in ['text/html', 'text/javascript', 'application']):
                # Rewrite embedded wikipedia/wikimedia URLs in textual bodies.
                content = fix_url(content.decode()).encode()
            headers['content-length'] = str(len(content))
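
            # Persist the response; subtracting upstream 'age' stores the
            # original fetch time so freshness can be recomputed on later reads.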
            await aiofiles.os.makedirs(cache_path, exist_ok=True)
            async with aiofiles.open(cache_path + '/_file', mode='wb') as f:
                await f.write(int(time() - int(headers.get('age', 0))).to_bytes(8, 'big'))
                await f.write(response.status.to_bytes(2, 'big'))
                await f.write(len(headers).to_bytes(1, 'big'))
                for k, v in headers.items():
                    # Single-byte length prefixes cap keys and values at 255 bytes.
                    key_encoded = k.encode()
                    await f.write(len(key_encoded).to_bytes(1, 'big'))
                    await f.write(key_encoded)
                    value_encoded = v.encode()
                    await f.write(len(value_encoded).to_bytes(1, 'big'))
                    await f.write(value_encoded)
                await f.write(content)
            return Response(content, status=response.status, headers=headers)


if __name__ == "__main__":
    app.debug = True
    app.run()