Update rss_proxy.py

* remove guid rewrite
This commit is contained in:
Slava
2025-03-10 07:33:38 +00:00
parent 99133daeed
commit a3dff4a339

View File

@@ -1,116 +1,106 @@
import html import html
import urllib.parse import urllib.parse
import os import os
import requests import requests
import redis import redis
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import re import re
import unicodedata import unicodedata
from flask import request, Response from flask import request, Response
# Optional upstream HTTP(S) proxy; requests go direct when unset.
PROXY_URL = os.getenv("PROXY_URL")
# Redis connection string and per-item cache lifetime in seconds.
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
CACHE_TTL = int(os.getenv("CACHE_TTL", 3600))
# Shared Redis client used for the per-item RSS cache.
rdb = redis.from_url(REDIS_URL)
_head_html = f"""<?xml version="1.0" encoding="UTF-8"?> _head_html = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"> <rss version="2.0">
<channel><title>Tapochek.net RSS</title> <channel><title>Tapochek.net RSS</title>
<link>http://tapochek.net/</link> <link>http://tapochek.net/</link>
<ttl>15</ttl>""" <ttl>15</ttl>"""
def normalize_text(text):
    """Return *text* normalized to Unicode NFKC; falsy values pass through."""
    if not text:
        # None / "" are returned unchanged so callers can chain safely.
        return text
    return unicodedata.normalize("NFKC", text)
def extract_viewtopic_link(description):
    """Extract the first http(s) ``href`` URL from an RSS item description.

    The description may arrive HTML-escaped (``&quot;`` etc.), so it is
    unescaped before matching.

    Returns the URL string, or ``None`` when *description* is empty/None
    or contains no matching href attribute.
    """
    if not description:
        # Robustness fix: the original crashed (TypeError) on None input.
        return None
    decoded_description = html.unescape(description)
    match = re.search(r'href="(https?://[^"]+)"', decoded_description)
    return match.group(1) if match else None
def normalize_xml_texts(elem):
    """Apply normalize_text to every .text and .tail node in the subtree."""
    # Iterative traversal instead of recursion; visits the same nodes.
    stack = [elem]
    while stack:
        node = stack.pop()
        if node.text:
            node.text = normalize_text(node.text)
        if node.tail:
            node.tail = normalize_text(node.tail)
        stack.extend(node)
def init_proxy(app):
    """Register the ``/proxy`` route on the given Flask *app*.

    The route fetches a remote RSS feed (optionally through PROXY_URL),
    normalizes its text, and caches individual ``<item>`` elements in Redis.
    """

    @app.route("/proxy")
    def proxy():
        """Proxy an RSS feed with per-item Redis caching.

        Query string must be ``url=<feed-url>``.  Returns the rebuilt feed
        as UTF-8 XML, ``400`` when the url parameter is missing, or ``500``
        with the error text on any fetch/parse failure.
        """
        raw_query = request.query_string.decode()
        if raw_query.startswith("url="):
            url = urllib.parse.unquote(raw_query[4:])
            url = html.unescape(url)
        else:
            return "Missing URL", 400
        try:
            proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None
            r = requests.get(url, timeout=10, proxies=proxies)
            xml_data = r.text
            # Upstream feeds contain bare '&' characters that break the XML
            # parser; escape everything here (html.unescape below restores
            # entities that were already escaped).
            xml_data = xml_data.replace("&", "&amp;")
            # Force a UTF-8 declaration.  Bug fix: the original computed
            # _encoding via split() BEFORE this guard, raising IndexError
            # for feeds whose declaration has no encoding attribute.
            if '<?xml version="1.0" encoding="' in xml_data:
                _encoding = xml_data.split('encoding="')[1].split('"')[0]
                xml_data = xml_data.replace(
                    f'<?xml version="1.0" encoding="{_encoding}"?>',
                    '<?xml version="1.0" encoding="UTF-8"?>'
                )
            if '<?xml version="1.0" encoding="UTF-8"?>' not in xml_data:
                xml_data = f'<?xml version="1.0" encoding="UTF-8"?>{xml_data}'
            root = ET.fromstring(xml_data)
            items = root.findall(".//item")
            cached_items = []
            new_items = []
            for item in items:
                guid = item.find("guid")
                if guid is None or not guid.text:
                    continue  # no stable cache key without a guid
                cache_key = f"rss:item:{guid.text}"
                cached_item = rdb.get(cache_key)
                title = item.find("title")
                # Robustness fix: original dereferenced title.text without
                # checking the element exists / has text.
                if title is not None and title.text:
                    # Strip leftover numeric character references.
                    title.text = re.sub(r'&#\d+', '', title.text)
                if cached_item:
                    cached_items.append(cached_item.decode())
                else:
                    normalize_xml_texts(item)
                    item_str = ET.tostring(item, encoding="unicode")
                    item_str = html.unescape(item_str)
                    rdb.setex(cache_key, CACHE_TTL, item_str)
                    new_items.append(item_str)
            final_items = cached_items + new_items
            response_xml = f"""{_head_html}{"".join(final_items)}</channel></rss>"""
            return Response(response_xml, content_type="application/xml; charset=utf-8")
        except Exception as e:
            # Top-level boundary: report any fetch/parse failure to the client.
            return f"Error: {e}", 500