Skip to content

Commit

Permalink
Enhancement: Replaced inline HTML tags with textual format for better…
Browse files Browse the repository at this point in the history
… LLM context handling unclecode#24
  • Loading branch information
unclecode committed Jun 17, 2024
1 parent 42a5da8 commit 4135955
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## [0.2.5] - 2024-06-17
### Added
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.


## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

## Recent Changes

### v0.2.34
### v0.2.5
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.

### v0.2.4
- 🐞 Resolved the issue with long URLs by using an MD5 hash for cache file names. (Issue #22)

### v0.2.3
Expand Down
35 changes: 35 additions & 0 deletions crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,38 @@ def handle_tag(self, tag, attrs, start):

super().handle_tag(tag, attrs, start)

def replace_inline_tags(soup, tags):
    """Replace inline HTML tags with a textual (Markdown-like) representation.

    Rewrites the soup in place so that inline semantic tags (bold, emphasis,
    abbreviations, etc.) keep their meaning as plain text, which is friendlier
    for downstream LLM context handling.

    Args:
        soup: A BeautifulSoup node whose descendants are rewritten in place.
        tags: Iterable of lowercase tag names to replace (e.g. ['b', 'abbr']).

    Returns:
        The same soup object, mutated in place.
    """
    def _abbr(tag):
        # Only append the expansion when a title attribute is actually
        # present; otherwise bare <abbr> tags would render as "text ()".
        title = tag.get('title')
        return f"{tag.text} ({title})" if title else tag.text

    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
        'u': lambda tag: f"__{tag.text}__",
        'span': lambda tag: f"{tag.text}",
        'del': lambda tag: f"~~{tag.text}~~",
        'ins': lambda tag: f"++{tag.text}++",
        'sub': lambda tag: f"~{tag.text}~",
        'sup': lambda tag: f"^^{tag.text}^^",
        'strong': lambda tag: f"**{tag.text}**",
        'em': lambda tag: f"*{tag.text}*",
        'code': lambda tag: f"`{tag.text}`",
        'kbd': lambda tag: f"`{tag.text}`",
        'var': lambda tag: f"_{tag.text}_",
        's': lambda tag: f"~~{tag.text}~~",
        'q': lambda tag: f'"{tag.text}"',
        'abbr': _abbr,
        'cite': lambda tag: f"_{tag.text}_",
        'dfn': lambda tag: f"_{tag.text}_",
        'time': lambda tag: f"{tag.text}",
        'small': lambda tag: f"<small>{tag.text}</small>",
        'mark': lambda tag: f"=={tag.text}=="
    }

    for tag_name in tags:
        for tag in soup.find_all(tag_name):
            # Fall back to the bare text for tag names without a rule,
            # so callers may pass extra tag names safely.
            replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
            tag.replace_with(replacement_text)

    return soup

def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
Expand Down Expand Up @@ -249,6 +281,9 @@ def replace_pre_tags_with_text(node):

# Replace all "pre" tags with their inner text
body = replace_pre_tags_with_text(body)

# Replace inline tags with their text content
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])

# Recursively remove empty elements, their parent elements, and elements with word count below threshold
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
Expand Down

0 comments on commit 4135955

Please sign in to comment.