Update rss_proxy.py

* remove guid rewrite
This commit is contained in:
Slava
2025-03-10 07:33:38 +00:00
parent 99133daeed
commit a3dff4a339

View File

@@ -1,116 +1,106 @@
import html import html
import urllib.parse import urllib.parse
import os import os
import requests import requests
import redis import redis
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import re import re
import unicodedata import unicodedata
from flask import request, Response from flask import request, Response
# Optional upstream HTTP(S) proxy; requests go direct when unset.
PROXY_URL = os.getenv("PROXY_URL")
# Redis connection string and per-item cache lifetime in seconds.
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
CACHE_TTL = int(os.getenv("CACHE_TTL", 3600))
# Shared Redis client used for the per-item RSS cache.
rdb = redis.from_url(REDIS_URL)
_head_html = f"""<?xml version="1.0" encoding="UTF-8"?> _head_html = f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"> <rss version="2.0">
<channel><title>Tapochek.net RSS</title> <channel><title>Tapochek.net RSS</title>
<link>http://tapochek.net/</link> <link>http://tapochek.net/</link>
<ttl>15</ttl>""" <ttl>15</ttl>"""
def normalize_text(text):
    """Return *text* normalized to Unicode NFKC; falsy values pass through."""
    if not text:
        # None / "" are returned unchanged so callers can chain safely.
        return text
    return unicodedata.normalize("NFKC", text)
def extract_viewtopic_link(description):
    """Extract the first http(s) ``href`` URL from an RSS item description.

    The description may arrive HTML-escaped (``&quot;`` etc.), so it is
    unescaped before matching.

    Returns the URL string, or ``None`` when *description* is empty/None
    or contains no matching href attribute.
    """
    if not description:
        # Robustness fix: the original crashed (TypeError) on None input.
        return None
    decoded_description = html.unescape(description)
    match = re.search(r'href="(https?://[^"]+)"', decoded_description)
    return match.group(1) if match else None
def normalize_xml_texts(elem):
    """Apply normalize_text to every .text and .tail node in the subtree."""
    # Iterative traversal instead of recursion; visits the same nodes.
    stack = [elem]
    while stack:
        node = stack.pop()
        if node.text:
            node.text = normalize_text(node.text)
        if node.tail:
            node.tail = normalize_text(node.tail)
        stack.extend(node)
def init_proxy(app):
    """Register the ``/proxy`` route on the given Flask *app*.

    The route fetches a remote RSS feed (optionally through PROXY_URL),
    normalizes its text, and caches individual ``<item>`` elements in Redis.
    """

    @app.route("/proxy")
    def proxy():
        """Proxy an RSS feed with per-item Redis caching.

        Query string must be ``url=<feed-url>``.  Returns the rebuilt feed
        as UTF-8 XML, ``400`` when the url parameter is missing, or ``500``
        with the error text on any fetch/parse failure.
        """
        raw_query = request.query_string.decode()
        if raw_query.startswith("url="):
            url = urllib.parse.unquote(raw_query[4:])
            url = html.unescape(url)
        else:
            return "Missing URL", 400
        try:
            proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None
            r = requests.get(url, timeout=10, proxies=proxies)
            xml_data = r.text
            # Upstream feeds contain bare '&' characters that break the XML
            # parser; escape everything here (html.unescape below restores
            # entities that were already escaped).
            xml_data = xml_data.replace("&", "&amp;")
            # Force a UTF-8 declaration.  Bug fix: the original computed
            # _encoding via split() BEFORE this guard, raising IndexError
            # for feeds whose declaration has no encoding attribute.
            if '<?xml version="1.0" encoding="' in xml_data:
                _encoding = xml_data.split('encoding="')[1].split('"')[0]
                xml_data = xml_data.replace(
                    f'<?xml version="1.0" encoding="{_encoding}"?>',
                    '<?xml version="1.0" encoding="UTF-8"?>'
                )
            if '<?xml version="1.0" encoding="UTF-8"?>' not in xml_data:
                xml_data = f'<?xml version="1.0" encoding="UTF-8"?>{xml_data}'
            root = ET.fromstring(xml_data)
            items = root.findall(".//item")
            cached_items = []
            new_items = []
            for item in items:
                guid = item.find("guid")
                if guid is None or not guid.text:
                    continue  # no stable cache key without a guid
                cache_key = f"rss:item:{guid.text}"
                cached_item = rdb.get(cache_key)
                title = item.find("title")
                # Robustness fix: original dereferenced title.text without
                # checking the element exists / has text.
                if title is not None and title.text:
                    # Strip leftover numeric character references.
                    title.text = re.sub(r'&#\d+', '', title.text)
                if cached_item:
                    cached_items.append(cached_item.decode())
                else:
                    normalize_xml_texts(item)
                    item_str = ET.tostring(item, encoding="unicode")
                    item_str = html.unescape(item_str)
                    rdb.setex(cache_key, CACHE_TTL, item_str)
                    new_items.append(item_str)
            final_items = cached_items + new_items
            response_xml = f"""{_head_html}{"".join(final_items)}</channel></rss>"""
            return Response(response_xml, content_type="application/xml; charset=utf-8")
        except Exception as e:
            # Top-level boundary: report any fetch/parse failure to the client.
            return f"Error: {e}", 500