From 30883c28361e6201023ab8005e0cc93f49b4b95e Mon Sep 17 00:00:00 2001
From: bacon <rik.slava@gmail.com>
Date: Wed, 5 Mar 2025 10:43:51 +0300
Subject: [PATCH] update rss_proxy.py * added check for `CharRef: invalid
 decimal value`

---
 proxy/rss_proxy.py | 45 ++++++++++++++++++++++++++++++++++++++-------
 requirements.txt   |  2 +-
 2 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/proxy/rss_proxy.py b/proxy/rss_proxy.py
index 754ab03..fc1fd55 100755
--- a/proxy/rss_proxy.py
+++ b/proxy/rss_proxy.py
@@ -4,6 +4,7 @@ import requests
 import redis
 import xml.etree.ElementTree as ET
 import re
+import unicodedata
 from flask import request, Response
 
 PROXY_URL = os.getenv("PROXY_URL")
@@ -13,12 +14,29 @@ CACHE_TTL = int(os.getenv("CACHE_TTL", 3600))
 rdb = redis.from_url(REDIS_URL)
 
 
+def normalize_text(text):
+    """Normalize text to Unicode NFKC form to fold unusual characters; falsy input is returned as-is."""
+    if text:
+        return unicodedata.normalize("NFKC", text)
+    return text
+
+
 def extract_viewtopic_link(description):
-    """Search viewtopic.php in description"""
-    match = re.search(r'href="(http://tapochek\.net/viewtopic\.php\?t=\d+)"', description)
+    """Return the first http(s) href URL found in description, or None if there is none."""
+    match = re.search(r'href="(https?://[^"]+)"', description)
     return match.group(1) if match else None
 
 
+def normalize_xml_texts(elem):
+    """Apply normalize_text to the text and tail of elem and of all its descendants."""
+    if elem.text:
+        elem.text = normalize_text(elem.text)
+    if elem.tail:
+        elem.tail = normalize_text(elem.tail)
+    for child in elem:
+        normalize_xml_texts(child)
+
+
 def init_proxy(app):
     @app.route("/proxy")
     def proxy():
@@ -32,10 +50,18 @@ def init_proxy(app):
         try:
             proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None
             r = requests.get(url, timeout=10, proxies=proxies)
-            _encode = r.apparent_encoding.lower()
-            r.encoding = _encode
-            xml_data = r.text.replace(f'<?xml version="1.0" encoding="{_encode}"?>',
-                                      '<?xml version="1.0" encoding="UTF-8"?>')
+            r.encoding = r.apparent_encoding
+
+            xml_data = r.text
+            xml_data = xml_data.replace("&", "&amp;")
+            if '<?xml version="1.0" encoding="' in xml_data:
+                xml_data = xml_data.replace(
+                    '<?xml version="1.0" encoding="' + xml_data.split('encoding="')[1].split('"')[0] + '"?>',
+                    '<?xml version="1.0" encoding="UTF-8"?>'
+                )
+
+            if '<?xml version="1.0" encoding="UTF-8"?>' not in xml_data:
+                xml_data = f'<?xml version="1.0" encoding="UTF-8"?>{xml_data}'
 
             root = ET.fromstring(xml_data)
             items = root.findall(".//item")
@@ -45,7 +71,7 @@ def init_proxy(app):
 
             for item in items:
                 guid = item.find("guid")
-                if guid is None or guid.get("isPermaLink") == "true":
+                if guid is None or not guid.text:
                     continue
 
                 cache_key = f"rss:item:{guid.text}"
@@ -57,9 +83,14 @@ def init_proxy(app):
                     description = item.find("description")
                     if description is not None:
                         new_guid = extract_viewtopic_link(description.text)
+                        print(new_guid)
                         if new_guid:
+                            guid.attrib.clear()
                             guid.text = new_guid
 
+                    # Normalize the whole <item>
+                    normalize_xml_texts(item)
+
                     item_str = ET.tostring(item, encoding="unicode")
                     rdb.setex(cache_key, CACHE_TTL, item_str)
                     new_items.append(item_str)
diff --git a/requirements.txt b/requirements.txt
index b76320d..a17295b 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,4 @@ requests==2.32.3
 Flask==3.1.0
 loguru==0.7.3
 redis==5.2.1
-gunicorn==23.0.0
+gunicorn==23.0.0
\ No newline at end of file