update rss_proxy.py

* Added handling for the `CharRef: invalid decimal value` XML parsing error
This commit is contained in:
2025-03-05 10:43:51 +03:00
parent e2e4061f44
commit 30883c2836
2 changed files with 39 additions and 8 deletions

View File

@@ -4,6 +4,7 @@ import requests
import redis
import xml.etree.ElementTree as ET
import re
import unicodedata
from flask import request, Response
PROXY_URL = os.getenv("PROXY_URL")
@@ -13,12 +14,29 @@ CACHE_TTL = int(os.getenv("CACHE_TTL", 3600))
rdb = redis.from_url(REDIS_URL)
def normalize_text(text):
    """Return text in Unicode NFKC form, folding compatibility characters.

    Falsy input (None or an empty string) is handed back unchanged.
    """
    return unicodedata.normalize("NFKC", text) if text else text
def extract_viewtopic_link(description):
    """Return the first absolute http(s) URL found in an href attribute.

    Despite the function name, any ``href="http(s)://..."`` link matches,
    not only viewtopic.php URLs.

    Args:
        description: Text to scan for a link; may be None or empty.

    Returns:
        The URL captured from the first matching ``href`` attribute, or
        None when description is falsy or holds no such attribute.
    """
    # Guard against None: re.search would raise TypeError on it, and an
    # element without text content has its .text set to None.
    if not description:
        return None
    match = re.search(r'href="(https?://[^"]+)"', description)
    return match.group(1) if match else None
def normalize_xml_texts(elem):
    """Run normalize_text over every text node of an XML subtree, in place.

    Both the ``text`` and the ``tail`` of elem and of each of its
    descendants are rewritten; empty or missing values are left untouched.
    """
    # Element.iter() yields elem itself first, then all descendants in
    # document order, which is the same set the recursive walk visited.
    for node in elem.iter():
        if node.text:
            node.text = normalize_text(node.text)
        if node.tail:
            node.tail = normalize_text(node.tail)
def init_proxy(app):
@app.route("/proxy")
def proxy():
@@ -32,10 +50,18 @@ def init_proxy(app):
try:
proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None
r = requests.get(url, timeout=10, proxies=proxies)
_encode = r.apparent_encoding.lower()
r.encoding = _encode
xml_data = r.text.replace(f'<?xml version="1.0" encoding="{_encode}"?>',
'<?xml version="1.0" encoding="UTF-8"?>')
r.encoding = r.apparent_encoding
xml_data = r.text
xml_data = xml_data.replace("&", "&amp;")
if '<?xml version="1.0" encoding="' in xml_data:
xml_data = xml_data.replace(
'<?xml version="1.0" encoding="' + xml_data.split('encoding="')[1].split('"')[0] + '"?>',
'<?xml version="1.0" encoding="UTF-8"?>'
)
if '<?xml version="1.0" encoding="UTF-8"?>' not in xml_data:
xml_data = f'<?xml version="1.0" encoding="UTF-8"?>{xml_data}'
root = ET.fromstring(xml_data)
items = root.findall(".//item")
@@ -45,7 +71,7 @@ def init_proxy(app):
for item in items:
guid = item.find("guid")
if guid is None or guid.get("isPermaLink") == "true":
if guid is None or not guid.text:
continue
cache_key = f"rss:item:{guid.text}"
@@ -57,9 +83,14 @@ def init_proxy(app):
description = item.find("description")
if description is not None:
new_guid = extract_viewtopic_link(description.text)
print(new_guid)
if new_guid:
guid.attrib.clear()
guid.text = new_guid
# Нормализуем весь <item>
normalize_xml_texts(item)
item_str = ET.tostring(item, encoding="unicode")
rdb.setex(cache_key, CACHE_TTL, item_str)
new_items.append(item_str)

View File

@@ -2,4 +2,4 @@ requests==2.32.3
Flask==3.1.0
loguru==0.7.3
redis==5.2.1
gunicorn==23.0.0
gunicorn==23.0.0