diff --git a/services/parser/html_parser.py b/services/parser/html_parser.py index 1b96760..d84e34c 100644 --- a/services/parser/html_parser.py +++ b/services/parser/html_parser.py @@ -148,6 +148,8 @@ class ParsedDocument: def _attr_str(tag: Tag, attr: str) -> str: """Safely get a tag attribute as a joined string.""" + if tag.attrs is None: + return "" val = tag.get(attr, "") if isinstance(val, list): return " ".join(val) @@ -168,9 +170,10 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None: for tag in soup.find_all(tag_name): tag.decompose() - for tag in soup.find_all(True): - if _is_boilerplate_container(tag): - tag.decompose() + # Collect boilerplate tags first, then decompose to avoid iterator issues + to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)] + for tag in to_remove: + tag.decompose() def _reduce_boilerplate_text(text: str) -> str: