Skip to content

Commit

Permalink
Enhancement: Replaced inline HTML tags with textual format for better…
Browse files Browse the repository at this point in the history
… LLM context handling unclecode#24
  • Loading branch information
unclecode committed Jun 17, 2024
1 parent 42a5da8 commit 4135955
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## [0.2.5] - 2024-06-17
### Added
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.


## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

## Recent Changes

### v0.2.34
### v0.2.5
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.

### v0.2.4
- 🐞 Resolved the issue with long URLs by using an MD5 hash for cache file names. (Issue #22)

### v0.2.3
Expand Down
35 changes: 35 additions & 0 deletions crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,38 @@ def handle_tag(self, tag, attrs, start):

super().handle_tag(tag, attrs, start)

def replace_inline_tags(soup, tags):
    """Replace inline HTML tags with a textual (Markdown-like) representation.

    Rewrites the soup in place so that inline semantic tags (bold, emphasis,
    abbreviations, etc.) keep their meaning as plain text, which is friendlier
    for downstream LLM context handling.

    Args:
        soup: A BeautifulSoup node whose descendants are rewritten in place.
        tags: Iterable of lowercase tag names to replace (e.g. ['b', 'abbr']).

    Returns:
        The same soup object, mutated in place.
    """
    def _abbr(tag):
        # Only append the expansion when a title attribute is actually
        # present; otherwise bare <abbr> tags would render as "text ()".
        title = tag.get('title')
        return f"{tag.text} ({title})" if title else tag.text

    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
        'u': lambda tag: f"__{tag.text}__",
        'span': lambda tag: f"{tag.text}",
        'del': lambda tag: f"~~{tag.text}~~",
        'ins': lambda tag: f"++{tag.text}++",
        'sub': lambda tag: f"~{tag.text}~",
        'sup': lambda tag: f"^^{tag.text}^^",
        'strong': lambda tag: f"**{tag.text}**",
        'em': lambda tag: f"*{tag.text}*",
        'code': lambda tag: f"`{tag.text}`",
        'kbd': lambda tag: f"`{tag.text}`",
        'var': lambda tag: f"_{tag.text}_",
        's': lambda tag: f"~~{tag.text}~~",
        'q': lambda tag: f'"{tag.text}"',
        'abbr': _abbr,
        'cite': lambda tag: f"_{tag.text}_",
        'dfn': lambda tag: f"_{tag.text}_",
        'time': lambda tag: f"{tag.text}",
        'small': lambda tag: f"<small>{tag.text}</small>",
        'mark': lambda tag: f"=={tag.text}=="
    }

    for tag_name in tags:
        for tag in soup.find_all(tag_name):
            # Fall back to the bare text for tag names without a rule,
            # so callers may pass extra tag names safely.
            replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
            tag.replace_with(replacement_text)

    return soup

def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
Expand Down Expand Up @@ -249,6 +281,9 @@ def replace_pre_tags_with_text(node):

# Replace all "pre" tags with their inner text
body = replace_pre_tags_with_text(body)

# Replace inline tags with their text content
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])

# Recursively remove empty elements, their parent elements, and elements with word count below threshold
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
Expand Down

0 comments on commit 4135955

Please sign in to comment.