feat: Enhance crawler flexibility and LLM extraction capabilities
- Add browser type selection (Chromium, Firefox, WebKit)
- Implement iframe content extraction
- Improve image processing and dimension updates
- Add custom headers support in AsyncPlaywrightCrawlerStrategy
- Enhance delayed content retrieval with new parameter
- Optimize HTML sanitization and Markdown conversion
- Update examples in quickstart_async.py for new features
unclecode committed Oct 14, 2024
1 parent b9bbd42 commit 320afde
Showing 7 changed files with 242 additions and 97 deletions.
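The features listed in the commit message surface as keyword arguments on the crawler strategy and on the crawl call. Below is a usage sketch based on the kwargs visible in this diff; the AsyncWebCrawler / arun entry points, the crawler_strategy parameter, and result.markdown are assumed from the project's quickstart examples rather than shown here.

import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def main():
    # browser_type and headers are constructor kwargs added in this commit
    strategy = AsyncPlaywrightCrawlerStrategy(
        browser_type="firefox",                # "chromium" (default), "firefox", or "webkit"
        headers={"Accept-Language": "en-US"},  # custom request headers
    )
    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            process_iframes=True,              # inline iframe bodies into the returned HTML
            delay_before_return_html=2.0,      # seconds to wait before grabbing the final HTML
        )
        print(result.markdown[:300])

asyncio.run(main())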
3 changes: 2 additions & 1 deletion .gitignore
@@ -203,4 +203,5 @@ git_changes.py
git_changes.md
pypi_build.sh

.tests/
.tests/
git_changes.py
125 changes: 120 additions & 5 deletions crawl4ai/async_crawler_strategy.py
@@ -50,7 +50,8 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
self.proxy = kwargs.get("proxy")
self.headless = kwargs.get("headless", True)
self.headers = {}
self.browser_type = kwargs.get("browser_type", "chromium") # New parameter
self.headers = kwargs.get("headers", {})
self.sessions = {}
self.session_ttl = 1800
self.js_code = js_code
@@ -80,7 +81,6 @@ async def start(self):
if self.browser is None:
browser_args = {
"headless": self.headless,
# "headless": False,
"args": [
"--disable-gpu",
"--disable-dev-shm-usage",
@@ -95,7 +95,14 @@
browser_args["proxy"] = proxy_settings


self.browser = await self.playwright.chromium.launch(**browser_args)
# Select the appropriate browser based on the browser_type
if self.browser_type == "firefox":
self.browser = await self.playwright.firefox.launch(**browser_args)
elif self.browser_type == "webkit":
self.browser = await self.playwright.webkit.launch(**browser_args)
else:
self.browser = await self.playwright.chromium.launch(**browser_args)

await self.execute_hook('on_browser_created', self.browser)

async def close(self):
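For reference, a standalone sketch of the same per-engine dispatch with plain Playwright; getattr over the Playwright object is a compact equivalent of the if/elif chain above. This is an aside, not part of the commit.

from playwright.async_api import async_playwright

async def launch_browser(browser_type: str = "chromium", **browser_args):
    p = await async_playwright().start()
    # Playwright exposes .chromium, .firefox and .webkit launchers by name
    launcher = getattr(p, browser_type, p.chromium)
    return await launcher.launch(**browser_args)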
@@ -145,7 +152,6 @@ def _cleanup_expired_sessions(self):
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))


async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
wait_for = wait_for.strip()

@@ -209,6 +215,48 @@ async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout:
except Exception as e:
raise RuntimeError(f"Error in wait condition: {str(e)}")

async def process_iframes(self, page):
# Find all iframes
iframes = await page.query_selector_all('iframe')

for i, iframe in enumerate(iframes):
try:
# Add a unique identifier to the iframe
await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')

# Get the frame associated with this iframe
frame = await iframe.content_frame()

if frame:
# Wait for the frame to load
await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout

# Extract the content of the iframe's body
iframe_content = await frame.evaluate('() => document.body.innerHTML')

# Generate a unique class name for this iframe
class_name = f'extracted-iframe-content-{i}'

# Replace the iframe with a div containing the extracted content
_iframe = iframe_content.replace('`', '\\`')
await page.evaluate(f"""
() => {{
const iframe = document.getElementById('iframe-{i}');
const div = document.createElement('div');
div.innerHTML = `{_iframe}`;
div.className = '{class_name}';
iframe.replaceWith(div);
}}
""")
else:
print(f"Warning: Could not access content frame for iframe {i}")
except Exception as e:
print(f"Error processing iframe {i}: {str(e)}")

# Return the page object
return page


async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
response_headers = {}
status_code = None
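Because process_iframes wraps each inlined iframe in a div with a predictable class name (extracted-iframe-content-<index>), callers can later isolate iframe-only text from the crawled HTML. A downstream sketch, assuming BeautifulSoup is available:

from bs4 import BeautifulSoup

def iframe_sections(html):
    soup = BeautifulSoup(html, "html.parser")
    # divs produced by process_iframes carry class extracted-iframe-content-<index>
    return [div.get_text(" ", strip=True)
            for div in soup.select('div[class^="extracted-iframe-content-"]')]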
@@ -263,6 +311,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
status_code = 200
response_headers = {}


await page.wait_for_selector('body')
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

@@ -305,11 +354,78 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)


# New code to update image dimensions
update_image_dimensions_js = """
() => {
return new Promise((resolve) => {
const filterImage = (img) => {
// Filter out images that are too small
if (img.width < 100 && img.height < 100) return false;
// Filter out images that are not visible
const rect = img.getBoundingClientRect();
if (rect.width === 0 || rect.height === 0) return false;
// Filter out images with certain class names (e.g., icons, thumbnails)
if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;
// Filter out images with certain patterns in their src (e.g., placeholder images)
if (img.src.includes('placeholder') || img.src.includes('icon')) return false;
return true;
};
const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
let imagesLeft = images.length;
if (imagesLeft === 0) {
resolve();
return;
}
const checkImage = (img) => {
if (img.complete && img.naturalWidth !== 0) {
img.setAttribute('width', img.naturalWidth);
img.setAttribute('height', img.naturalHeight);
imagesLeft--;
if (imagesLeft === 0) resolve();
}
};
images.forEach(img => {
checkImage(img);
if (!img.complete) {
img.onload = () => {
checkImage(img);
};
img.onerror = () => {
imagesLeft--;
if (imagesLeft === 0) resolve();
};
}
});
// Fallback timeout of 5 seconds
setTimeout(() => resolve(), 5000);
});
}
"""
await page.evaluate(update_image_dimensions_js)

# Wait a bit for any onload events to complete
await page.wait_for_timeout(100)

# Process iframes
if kwargs.get("process_iframes", False):
page = await self.process_iframes(page)

await self.execute_hook('before_retrieve_html', page)
# Check if delay_before_return_html is set then wait for that time
delay_before_return_html = kwargs.get("delay_before_return_html")
if delay_before_return_html:
await asyncio.sleep(delay_before_return_html)

html = await page.content()
await self.execute_hook('before_return_html', page, html)
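The injected script above writes explicit width/height attributes onto qualifying <img> tags, so the HTML returned by crawl() can be size-filtered without re-downloading images. A downstream sketch of that filtering, assuming BeautifulSoup:

from bs4 import BeautifulSoup

def large_images(html, min_side=150):
    soup = BeautifulSoup(html, "html.parser")
    urls = []
    for img in soup.find_all("img"):
        try:
            w, h = int(img.get("width", 0)), int(img.get("height", 0))
        except (TypeError, ValueError):
            continue  # skip non-numeric or missing dimensions
        if w >= min_side and h >= min_side:
            urls.append(img.get("src") or img.get("data-src") or "")
    return urls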

@@ -398,7 +514,6 @@ async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = Non
except Error as e:
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")


async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
semaphore = asyncio.Semaphore(semaphore_count)
13 changes: 8 additions & 5 deletions crawl4ai/content_scrapping_strategy.py
Expand Up @@ -16,8 +16,6 @@
CustomHTML2Text
)



class ContentScrappingStrategy(ABC):
@abstractmethod
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
@@ -129,7 +127,7 @@ def fetch_image_file_size(img, base_url):
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
image_format = os.path.splitext(img.get('src',''))[1].lower()
# Remove . from format
image_format = image_format.strip('.')
image_format = image_format.strip('.').split('?')[0]
score = 0
if height_value:
if height_unit == 'px' and height_value > 150:
@@ -158,6 +156,7 @@ def fetch_image_file_size(img, base_url):
return None
return {
'src': img.get('src', ''),
'data-src': img.get('data-src', ''),
'alt': img.get('alt', ''),
'desc': find_closest_parent_with_useful_text(img),
'score': score,
@@ -275,11 +274,14 @@ def flatten_nested_elements(node):
# Replace base64 data with empty string
img['src'] = base64_pattern.sub('', src)
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
cleaned_html = sanitize_html(cleaned_html)

h = CustomHTML2Text()
h.ignore_links = True
markdown = h.handle(cleaned_html)
h.body_width = 0
try:
markdown = h.handle(cleaned_html)
except Exception as e:
markdown = h.handle(sanitize_html(cleaned_html))
markdown = markdown.replace(' ```', '```')

try:
@@ -288,6 +290,7 @@ def flatten_nested_elements(node):
print('Error extracting metadata:', str(e))
meta = {}

cleaned_html = sanitize_html(cleaned_html)
return {
'markdown': markdown,
'cleaned_html': cleaned_html,
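The h.body_width = 0 setting added above turns off the converter's hard line-wrapping (the default wraps output at 78 columns), which keeps sentences intact for downstream LLM extraction. A minimal sketch with the stock html2text package, assuming CustomHTML2Text inherits this behavior from html2text.HTML2Text:

import html2text

h = html2text.HTML2Text()
h.ignore_links = True
h.body_width = 0  # 0 disables wrapping; long paragraphs stay on one line
print(h.handle("<h1>Title</h1><p>A long paragraph that would otherwise be wrapped.</p>"))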
4 changes: 2 additions & 2 deletions crawl4ai/prompts.py
@@ -1,4 +1,4 @@
PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>
And here is the cleaned HTML content of that webpage:
@@ -79,7 +79,7 @@
2. For each block:
a. Assign it an index based on its order in the content.
b. Analyze the content and generate ONE semantic tag that describe what the block is about.
c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
c. Extract the text content, EXACTLY SAME AS THE GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.