rss-proxy/proxy/rss_proxy.py

import html
import os
import re
import unicodedata
import urllib.parse
import xml.etree.ElementTree as ET

import redis
import requests
from flask import Response, request

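# Runtime configuration comes from the environment: PROXY_URL is an optional
# outbound HTTP(S) proxy used when fetching the upstream feed, REDIS_URL points
# at the cache backend, and CACHE_TTL is the per-item cache lifetime in seconds.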
PROXY_URL = os.getenv("PROXY_URL")
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
CACHE_TTL = int(os.getenv("CACHE_TTL", 3600))
rdb = redis.from_url(REDIS_URL)
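
# Static channel prolog for the rebuilt feed; cached and freshly rendered <item>
# markup is joined between this header and the closing </channel></rss> tags.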
_head_html = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel><title>Tapochek.net RSS</title>
<link>http://tapochek.net/</link>
<ttl>15</ttl>"""


def normalize_text(text):
    """Normalize text to NFKC form to clean up odd or decomposed characters."""
    if text:
        return unicodedata.normalize("NFKC", text)
    return text
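

# The item <description> embeds the topic page as an HTML <a href="..."> link;
# the helper below extracts that first URL so it can replace the feed's GUID.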
def extract_viewtopic_link(description):
    """Return the first hyperlink found in an item's HTML description, if any."""
    decoded_description = html.unescape(description)
    match = re.search(r'href="(https?://[^"]+)"', decoded_description)
    return match.group(1) if match else None


def normalize_xml_texts(elem):
    """Apply normalize_text to every text node in the XML tree, recursively."""
    if elem.text:
        elem.text = normalize_text(elem.text)
    if elem.tail:
        elem.tail = normalize_text(elem.tail)
    for child in elem:
        normalize_xml_texts(child)
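

# init_proxy() registers a /proxy endpoint on the given Flask app. The endpoint
# expects ?url=<feed url>, fetches that feed (optionally through PROXY_URL),
# swaps each item's GUID for the topic link found in its description, and
# returns the rebuilt feed, caching every rendered <item> in Redis under
# rss:item:<guid>.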
def init_proxy(app):
    @app.route("/proxy")
    def proxy():
        """Proxy an RSS feed with per-item caching and GUID replacement."""
        # Read the raw query string so a feed URL that carries its own query
        # parameters is not split apart by Flask's argument parsing.
        raw_query = request.query_string.decode()
        if raw_query.startswith("url="):
            url = urllib.parse.unquote(raw_query[4:])
        else:
            return "Missing URL", 400
        try:
            proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None
            r = requests.get(url, timeout=10, proxies=proxies)
            xml_data = r.text
            # Escape bare ampersands so ElementTree can parse the feed; the
            # html.unescape() call below undoes any double escaping this causes.
            xml_data = xml_data.replace("&", "&amp;")
            # Force the XML prolog to declare UTF-8, since r.text is already decoded.
            if '<?xml version="1.0" encoding="' in xml_data:
                _encoding = xml_data.split('encoding="')[1].split('"')[0]
                xml_data = xml_data.replace(
                    f'<?xml version="1.0" encoding="{_encoding}"?>',
                    '<?xml version="1.0" encoding="UTF-8"?>'
                )
            if '<?xml version="1.0" encoding="UTF-8"?>' not in xml_data:
                xml_data = f'<?xml version="1.0" encoding="UTF-8"?>{xml_data}'
            root = ET.fromstring(xml_data)
            items = root.findall(".//item")
            cached_items = []
            new_items = []
            for item in items:
                guid = item.find("guid")
                if guid is None or not guid.text:
                    continue
                cache_key = f"rss:item:{guid.text}"
                cached_item = rdb.get(cache_key)
                title = item.find("title")
                if title is not None and title.text:
                    # Strip leftover numeric character references from the title.
                    title.text = re.sub(r'&#\d+;?', '', title.text)
                    print(title.text)
                if cached_item:
                    cached_items.append(cached_item.decode())
                else:
                    description = item.find("description")
                    if description is not None and description.text:
                        new_guid = extract_viewtopic_link(description.text)
                        if new_guid:
                            print(f"Replacing GUID: {guid.text} -> {new_guid}")
                            guid.attrib.clear()
                            guid.text = new_guid  # swap the GUID BEFORE normalization
                    # Normalize the whole item AFTER the GUID replacement.
                    normalize_xml_texts(item)
                    item_str = ET.tostring(item, encoding="unicode")
                    item_str = html.unescape(item_str)
                    rdb.setex(cache_key, CACHE_TTL, item_str)
                    new_items.append(item_str)
            final_items = cached_items + new_items
            response_xml = f"""{_head_html}{"".join(final_items)}</channel></rss>"""
            return Response(response_xml, content_type="application/xml; charset=utf-8")
        except Exception as e:
            return f"Error: {e}", 500