diff --git a/services/parser/html_parser.py b/services/parser/html_parser.py
index 1b96760..d84e34c 100644
--- a/services/parser/html_parser.py
+++ b/services/parser/html_parser.py
@@ -148,6 +148,8 @@ class ParsedDocument:
def _attr_str(tag: Tag, attr: str) -> str:
"""Safely get a tag attribute as a joined string."""
+ if tag.attrs is None:
+ return ""
val = tag.get(attr, "")
if isinstance(val, list):
return " ".join(val)
@@ -168,9 +170,10 @@ def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
for tag in soup.find_all(tag_name):
tag.decompose()
- for tag in soup.find_all(True):
- if _is_boilerplate_container(tag):
- tag.decompose()
+ # Run every boilerplate check before decomposing anything: decompose() also destroys a tag's descendants
+ to_remove = [tag for tag in soup.find_all(True) if tag.attrs is not None and _is_boilerplate_container(tag)]
+ for tag in to_remove:
+ tag.decompose()
def _reduce_boilerplate_text(text: str) -> str: