From 30883c28361e6201023ab8005e0cc93f49b4b95e Mon Sep 17 00:00:00 2001 From: bacon Date: Wed, 5 Mar 2025 10:43:51 +0300 Subject: [PATCH] update rss_proxy.py * added check for `CharRef: invalid decimal value` --- proxy/rss_proxy.py | 45 ++++++++++++++++++++++++++++++++++++++------- requirements.txt | 2 +- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/proxy/rss_proxy.py b/proxy/rss_proxy.py index 754ab03..fc1fd55 100755 --- a/proxy/rss_proxy.py +++ b/proxy/rss_proxy.py @@ -4,6 +4,7 @@ import requests import redis import xml.etree.ElementTree as ET import re +import unicodedata from flask import request, Response PROXY_URL = os.getenv("PROXY_URL") @@ -13,12 +14,29 @@ CACHE_TTL = int(os.getenv("CACHE_TTL", 3600)) rdb = redis.from_url(REDIS_URL) +def normalize_text(text): + """Приводит текст к нормальному виду, устраняя странные символы.""" + if text: + return unicodedata.normalize("NFKC", text) + return text + + def extract_viewtopic_link(description): - """Search viewtopic.php in description""" - match = re.search(r'href="(http://tapochek\.net/viewtopic\.php\?t=\d+)"', description) + """Ищет любую ссылку в description""" + match = re.search(r'href="(https?://[^"]+)"', description) return match.group(1) if match else None +def normalize_xml_texts(elem): + """Применяет normalize_text ко всем текстовым узлам XML.""" + if elem.text: + elem.text = normalize_text(elem.text) + if elem.tail: + elem.tail = normalize_text(elem.tail) + for child in elem: + normalize_xml_texts(child) + + def init_proxy(app): @app.route("/proxy") def proxy(): @@ -32,10 +50,18 @@ def init_proxy(app): try: proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None r = requests.get(url, timeout=10, proxies=proxies) - _encode = r.apparent_encoding.lower() - r.encoding = _encode - xml_data = r.text.replace(f'', - '') + r.encoding = r.apparent_encoding + + xml_data = r.text + xml_data = xml_data.replace("&", "&") + if '', + '' + ) + + if '' not in xml_data: + xml_data = f'{xml_data}' root = ET.fromstring(xml_data) items = root.findall(".//item") @@ -45,7 +71,7 @@ def init_proxy(app): for item in items: guid = item.find("guid") - if guid is None or guid.get("isPermaLink") == "true": + if guid is None or not guid.text: continue cache_key = f"rss:item:{guid.text}" @@ -57,9 +83,14 @@ def init_proxy(app): description = item.find("description") if description is not None: new_guid = extract_viewtopic_link(description.text) + print(new_guid) if new_guid: + guid.attrib.clear() guid.text = new_guid + # Нормализуем весь + normalize_xml_texts(item) + item_str = ET.tostring(item, encoding="unicode") rdb.setex(cache_key, CACHE_TTL, item_str) new_items.append(item_str) diff --git a/requirements.txt b/requirements.txt index b76320d..a17295b 100755 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ requests==2.32.3 Flask==3.1.0 loguru==0.7.3 redis==5.2.1 -gunicorn==23.0.0 +gunicorn==23.0.0 \ No newline at end of file