phase 17: fix HTML parser NoneType attrs crash in boilerplate stripping

This commit is contained in:
Celes Renata
2026-04-12 03:03:07 -07:00
parent 264b83ea56
commit e4a1d2d69a
+6 -3
View File
@@ -148,6 +148,8 @@ class ParsedDocument:
def _attr_str(tag: Tag, attr: str) -> str:
"""Safely get a tag attribute as a joined string."""
if tag.attrs is None:
return ""
val = tag.get(attr, "")
if isinstance(val, list):
return " ".join(val)
@@ -168,9 +170,10 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
for tag in soup.find_all(tag_name):
tag.decompose()
for tag in soup.find_all(True):
if _is_boilerplate_container(tag):
tag.decompose()
# Collect boilerplate tags first, then decompose to avoid iterator issues
to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)]
for tag in to_remove:
tag.decompose()
def _reduce_boilerplate_text(text: str) -> str: