phase 17: fix HTML parser NoneType attrs crash in boilerplate stripping

This commit is contained in:
Celes Renata
2026-04-12 03:03:07 -07:00
parent 264b83ea56
commit e4a1d2d69a
+6 -3
View File
@@ -148,6 +148,8 @@ class ParsedDocument:
 def _attr_str(tag: Tag, attr: str) -> str:
     """Safely get a tag attribute as a joined string."""
+    if tag.attrs is None:
+        return ""
     val = tag.get(attr, "")
     if isinstance(val, list):
         return " ".join(val)
@@ -168,9 +170,10 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
         for tag in soup.find_all(tag_name):
             tag.decompose()
-    for tag in soup.find_all(True):
-        if _is_boilerplate_container(tag):
-            tag.decompose()
+    # Collect boilerplate tags first, then decompose to avoid iterator issues
+    to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)]
+    for tag in to_remove:
+        tag.decompose()
 def _reduce_boilerplate_text(text: str) -> str: