Phase 17: fix HTML parser crash during boilerplate stripping when a tag's attrs is None
This commit is contained in:
@@ -148,6 +148,8 @@ class ParsedDocument:
|
|||||||
|
|
||||||
def _attr_str(tag: Tag, attr: str) -> str:
|
def _attr_str(tag: Tag, attr: str) -> str:
|
||||||
"""Safely get a tag attribute as a joined string."""
|
"""Safely get a tag attribute as a joined string."""
|
||||||
|
if tag.attrs is None:
|
||||||
|
return ""
|
||||||
val = tag.get(attr, "")
|
val = tag.get(attr, "")
|
||||||
if isinstance(val, list):
|
if isinstance(val, list):
|
||||||
return " ".join(val)
|
return " ".join(val)
|
||||||
@@ -168,9 +170,10 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
|
|||||||
for tag in soup.find_all(tag_name):
|
for tag in soup.find_all(tag_name):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
|
||||||
for tag in soup.find_all(True):
|
# Collect boilerplate tags first, then decompose to avoid iterator issues
|
||||||
if _is_boilerplate_container(tag):
|
to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)]
|
||||||
tag.decompose()
|
for tag in to_remove:
|
||||||
|
tag.decompose()
|
||||||
|
|
||||||
|
|
||||||
def _reduce_boilerplate_text(text: str) -> str:
|
def _reduce_boilerplate_text(text: str) -> str:
|
||||||
|
|||||||
Reference in New Issue
Block a user