from quart import Quart, Response, request
import aiohttp
import aiofiles
import aiofiles.os
from time import time
from binascii import crc32
import re

app = Quart(__name__, static_folder=None)

# Host suffix this proxy answers for; subdomains of it are mapped onto the
# corresponding wikipedia.org / wikimedia.org upstream hosts.
base_url = 'wikipedia.localhost:5000'

# Subdomains that live under wikimedia.org rather than wikipedia.org.
WIKIMEDIA_SUBDOMAINS = ['upload', 'meta', 'login', 'maps', 'foundation', 'developer']
# Telemetry subdomains that are silently swallowed (empty 200).
BLOCKED_SUBDOMAINS = ['intake-analytics']

# Upstream response headers (lower-cased) that are forwarded and cached.
_KEPT_HEADERS = ['cache-control', 'age', 'content-language', 'content-type',
                 'last-modified', 'date', 'x-content-type-options', 'location']


def fix_url(text):
    """Rewrite absolute and protocol-relative wikipedia.org / wikimedia.org
    URLs in *text* so they point back at this proxy instead.
    """
    pattern = r'((https:|http:)?//([\w-]+\.)+)(wikipedia|wikimedia)\.org'

    def repl(match):
        protocol = match.group(2)
        if protocol:
            # Absolute URL: force plain http, since the proxy serves http only.
            return 'http://' + match.group(3) + base_url
        # Protocol-relative URL stays protocol-relative.
        return '//' + match.group(3) + base_url

    return re.sub(pattern, repl, text)


def _cache_max_age(headers):
    """Return max-age (seconds) from a cache-control header, or 0 if absent.

    The original indexed the first 'max-age=' directive unconditionally,
    raising IndexError for cache-control values without one; default to 0
    (always stale) instead.
    """
    if 'cache-control' not in headers:
        return 0
    for directive in headers['cache-control'].split(','):
        directive = directive.strip()
        if directive.startswith('max-age='):
            try:
                return int(directive[8:])
            except ValueError:
                return 0
    return 0


async def _read_cache(cache_path):
    """Return a Response replayed from the cache file, or None if stale.

    File layout (all integers big-endian): 8-byte origin timestamp,
    2-byte status, 1-byte header count, then per header a 1-byte key
    length, key, 1-byte value length, value; the body is the remainder
    of the file.
    """
    async with aiofiles.open(cache_path + '/_file', mode='rb') as f:
        timestamp = int.from_bytes(await f.read(8), 'big')
        status = int.from_bytes(await f.read(2), 'big')
        headers = {}
        for _ in range(int.from_bytes(await f.read(1), 'big')):
            key_length = int.from_bytes(await f.read(1), 'big')
            key = (await f.read(key_length)).decode()
            value_length = int.from_bytes(await f.read(1), 'big')
            value = (await f.read(value_length)).decode()
            headers[key] = value
        age = int(time() - timestamp)
        headers['age'] = str(age)  # header values must be strings
        headers['x-rp-cache'] = 'HIT'
        if age >= _cache_max_age(headers):
            return None  # stale: caller falls through to a fresh fetch
        # Body is everything after the header section; reading to EOF avoids
        # the original KeyError when 'content-length' was never cached.
        content = await f.read()
        return Response(content, status=status, headers=headers)


async def _write_cache(cache_path, status, headers, content):
    """Persist an upstream response; see _read_cache for the file layout."""
    await aiofiles.os.makedirs(cache_path, exist_ok=True)
    async with aiofiles.open(cache_path + '/_file', mode='wb') as f:
        # Store the origin timestamp so age can be recomputed on read.
        # 'age' may be absent upstream (original crashed with KeyError).
        origin_ts = int(time()) - int(headers.get('age', 0))
        await f.write(int.to_bytes(origin_ts, 8, 'big'))
        await f.write(int.to_bytes(status, 2, 'big'))
        await f.write(int.to_bytes(len(headers), 1, 'big'))
        for key, value in headers.items():
            # NOTE: the 1-byte length fields cap keys/values at 255 bytes —
            # a limitation of the existing on-disk format, kept for
            # compatibility with already-written cache files.
            key_encoded = key.encode()
            await f.write(int.to_bytes(len(key_encoded), 1, 'big'))
            await f.write(key_encoded)
            value_encoded = value.encode()
            await f.write(int.to_bytes(len(value_encoded), 1, 'big'))
            await f.write(value_encoded)
        await f.write(content)


@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')  # fixed: was a duplicate '/' rule, so sub-paths never matched
async def proxy(path):
    """Serve *path* from the on-disk cache, or fetch it from the upstream
    wiki selected by the request's Host subdomain, rewriting embedded
    wikipedia/wikimedia URLs to keep the client on the proxy.
    """
    # Host may be missing entirely; default to '' so the guard below fires.
    host = request.headers.get('Host', '')
    if not host.endswith(base_url):
        return '', 200
    dep = host.split('.')[0]
    if host == base_url:
        url = 'https://wikipedia.org'
        dep = '_main'
    elif dep in BLOCKED_SUBDOMAINS:
        return '', 200
    elif dep in WIKIMEDIA_SUBDOMAINS:
        url = f'https://{dep}.wikimedia.org'
    else:
        url = f'https://{dep}.wikipedia.org'

    cache_path = 'cache/' + dep + '/' + path
    if request.args:
        # Re-append the query string (values forwarded as received, unencoded,
        # matching the original behavior) and vary the cache entry on it.
        path += '?' + '&'.join(k + '=' + v for k, v in request.args.items())
        cache_path += '_v' + str(crc32(path.encode()))

    # Refuse path traversal out of the cache directory.
    if any(segment in ('..', '.') for segment in path.split('/')):
        return 'Forbidden', 403

    # Cache replay. The original computed this and then unconditionally set
    # cached = False (debug leftover), permanently disabling the cache.
    if await aiofiles.os.path.exists(cache_path + '/_file'):
        cached = await _read_cache(cache_path)
        if cached is not None:
            return cached

    async with aiohttp.ClientSession() as session:
        async with session.get(url + '/' + path, allow_redirects=False) as response:
            status = response.status
            headers = {k.lower(): v for k, v in response.headers.items()
                       if k.lower() in _KEPT_HEADERS}
            content = await response.content.read()

    if 'location' in headers:
        # Keep redirects pointed at the proxy rather than the real wiki.
        headers['location'] = fix_url(headers['location'])
    content_type = headers.get('content-type', '')
    if any(content_type.startswith(t) for t in ('text/html', 'text/javascript', 'application')):
        content = fix_url(content.decode()).encode()
    # Always record the (possibly rewritten) body length — the original only
    # set it for rewritten bodies, breaking cache replay of binary responses.
    headers['content-length'] = str(len(content))

    await _write_cache(cache_path, status, headers, content)
    return Response(content, status=status, headers=headers)


if __name__ == "__main__":
    app.debug = True
    app.run()