phase 17: fix HTML parser crash during boilerplate stripping when a tag's attrs is None
This commit is contained in:
@@ -148,6 +148,8 @@ class ParsedDocument:
|
||||
|
||||
def _attr_str(tag: Tag, attr: str) -> str:
|
||||
"""Safely get a tag attribute as a joined string."""
|
||||
if tag.attrs is None:
|
||||
return ""
|
||||
val = tag.get(attr, "")
|
||||
if isinstance(val, list):
|
||||
return " ".join(val)
|
||||
@@ -168,8 +170,9 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
|
||||
for tag in soup.find_all(tag_name):
|
||||
tag.decompose()
|
||||
|
||||
for tag in soup.find_all(True):
|
||||
if _is_boilerplate_container(tag):
|
||||
# Collect boilerplate tags first, then decompose them, so the tree is not modified while it is being traversed
|
||||
to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)]
|
||||
for tag in to_remove:
|
||||
tag.decompose()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user