update rss_proxy.py

* added check for `CharRef: invalid decimal value`
This commit is contained in:
2025-03-05 10:43:51 +03:00
parent e2e4061f44
commit 30883c2836
2 changed files with 39 additions and 8 deletions

View File

@@ -4,6 +4,7 @@ import requests
import redis import redis
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import re import re
import unicodedata
from flask import request, Response from flask import request, Response
PROXY_URL = os.getenv("PROXY_URL") PROXY_URL = os.getenv("PROXY_URL")
@@ -13,12 +14,29 @@ CACHE_TTL = int(os.getenv("CACHE_TTL", 3600))
rdb = redis.from_url(REDIS_URL) rdb = redis.from_url(REDIS_URL)
def normalize_text(text):
    """Return *text* converted to Unicode NFKC normal form.

    Falsy input (``None`` or an empty string) is handed back unchanged.
    """
    if not text:
        return text
    return unicodedata.normalize("NFKC", text)
def extract_viewtopic_link(description):
    """Return the first http(s) link found in an ``href="..."`` attribute.

    Despite the function's name, any ``http://`` or ``https://`` URL is
    accepted, not only ``viewtopic.php`` links.

    Args:
        description: Text of an item's ``<description>``; may be ``None``
            or empty when the element carries no text.

    Returns:
        The URL of the first matching ``href`` attribute, or ``None`` when
        there is no input or nothing matches.
    """
    # The caller passes ``description.text``, which is None for an empty
    # element; re.search() raises TypeError on None, so bail out early.
    if not description:
        return None
    match = re.search(r'href="(https?://[^"]+)"', description)
    return match.group(1) if match else None
def normalize_xml_texts(elem):
    """Apply normalize_text to the text and tail of *elem* and all descendants.

    The elements are modified in place; nothing is returned.
    """
    # Element.iter() yields elem itself first, then every descendant in
    # document order - the same visiting order as a pre-order recursion.
    for node in elem.iter():
        if node.text:
            node.text = normalize_text(node.text)
        if node.tail:
            node.tail = normalize_text(node.tail)
def init_proxy(app): def init_proxy(app):
@app.route("/proxy") @app.route("/proxy")
def proxy(): def proxy():
@@ -32,10 +50,18 @@ def init_proxy(app):
try: try:
proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None proxies = {"http": PROXY_URL, "https": PROXY_URL} if PROXY_URL else None
r = requests.get(url, timeout=10, proxies=proxies) r = requests.get(url, timeout=10, proxies=proxies)
_encode = r.apparent_encoding.lower() r.encoding = r.apparent_encoding
r.encoding = _encode
xml_data = r.text.replace(f'<?xml version="1.0" encoding="{_encode}"?>', xml_data = r.text
'<?xml version="1.0" encoding="UTF-8"?>') xml_data = xml_data.replace("&", "&amp;")
if '<?xml version="1.0" encoding="' in xml_data:
xml_data = xml_data.replace(
'<?xml version="1.0" encoding="' + xml_data.split('encoding="')[1].split('"')[0] + '"?>',
'<?xml version="1.0" encoding="UTF-8"?>'
)
if '<?xml version="1.0" encoding="UTF-8"?>' not in xml_data:
xml_data = f'<?xml version="1.0" encoding="UTF-8"?>{xml_data}'
root = ET.fromstring(xml_data) root = ET.fromstring(xml_data)
items = root.findall(".//item") items = root.findall(".//item")
@@ -45,7 +71,7 @@ def init_proxy(app):
for item in items: for item in items:
guid = item.find("guid") guid = item.find("guid")
if guid is None or guid.get("isPermaLink") == "true": if guid is None or not guid.text:
continue continue
cache_key = f"rss:item:{guid.text}" cache_key = f"rss:item:{guid.text}"
@@ -57,9 +83,14 @@ def init_proxy(app):
description = item.find("description") description = item.find("description")
if description is not None: if description is not None:
new_guid = extract_viewtopic_link(description.text) new_guid = extract_viewtopic_link(description.text)
print(new_guid)
if new_guid: if new_guid:
guid.attrib.clear()
guid.text = new_guid guid.text = new_guid
# Нормализуем весь <item>
normalize_xml_texts(item)
item_str = ET.tostring(item, encoding="unicode") item_str = ET.tostring(item, encoding="unicode")
rdb.setex(cache_key, CACHE_TTL, item_str) rdb.setex(cache_key, CACHE_TTL, item_str)
new_items.append(item_str) new_items.append(item_str)

View File

@@ -2,4 +2,4 @@ requests==2.32.3
Flask==3.1.0 Flask==3.1.0
loguru==0.7.3 loguru==0.7.3
redis==5.2.1 redis==5.2.1
gunicorn==23.0.0 gunicorn==23.0.0