From e4a1d2d69af4621c1d5f2cc38e795a5cc6e5d120 Mon Sep 17 00:00:00 2001 From: Celes Renata Date: Sun, 12 Apr 2026 03:03:07 -0700 Subject: [PATCH] phase 17: fix HTML parser NoneType attrs crash in boilerplate stripping --- services/parser/html_parser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/services/parser/html_parser.py b/services/parser/html_parser.py index 1b96760..d84e34c 100644 --- a/services/parser/html_parser.py +++ b/services/parser/html_parser.py @@ -148,6 +148,8 @@ class ParsedDocument: def _attr_str(tag: Tag, attr: str) -> str: """Safely get a tag attribute as a joined string.""" + if tag.attrs is None: + return "" val = tag.get(attr, "") if isinstance(val, list): return " ".join(val) @@ -168,9 +170,10 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None: for tag in soup.find_all(tag_name): tag.decompose() - for tag in soup.find_all(True): - if _is_boilerplate_container(tag): - tag.decompose() + # Collect boilerplate tags first, then decompose to avoid iterator issues + to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)] + for tag in to_remove: + tag.decompose() def _reduce_boilerplate_text(text: str) -> str: