feat: Enhance crawler flexibility and LLM extraction capabilities
- Add browser type selection (Chromium, Firefox, WebKit)
- Implement iframe content extraction
- Improve image processing and dimension updates
- Add custom headers support in AsyncPlaywrightCrawlerStrategy
- Enhance delayed content retrieval with new parameter
- Optimize HTML sanitization and Markdown conversion
- Update examples in quickstart_async.py for new features
unclecode committed Oct 14, 2024
1 parent b9bbd42 commit 320afde
Showing 7 changed files with 242 additions and 97 deletions.
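The features listed in the commit message surface as keyword arguments on the crawler strategy and on the crawl call. Below is a usage sketch based on the kwargs visible in this diff; the AsyncWebCrawler / arun entry points, the crawler_strategy parameter, and result.markdown are assumed from the project's quickstart examples rather than shown here.

import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def main():
    # browser_type and headers are constructor kwargs added in this commit
    strategy = AsyncPlaywrightCrawlerStrategy(
        browser_type="firefox",                # "chromium" (default), "firefox", or "webkit"
        headers={"Accept-Language": "en-US"},  # custom request headers
    )
    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            process_iframes=True,              # inline iframe bodies into the returned HTML
            delay_before_return_html=2.0,      # seconds to wait before grabbing the final HTML
        )
        print(result.markdown[:300])

asyncio.run(main())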
3 changes: 2 additions & 1 deletion .gitignore
@@ -203,4 +203,5 @@ git_changes.py
git_changes.md
pypi_build.sh

.tests/
.tests/
git_changes.py
125 changes: 120 additions & 5 deletions crawl4ai/async_crawler_strategy.py
@@ -50,7 +50,8 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
self.proxy = kwargs.get("proxy")
self.headless = kwargs.get("headless", True)
self.headers = {}
self.browser_type = kwargs.get("browser_type", "chromium") # New parameter
self.headers = kwargs.get("headers", {})
self.sessions = {}
self.session_ttl = 1800
self.js_code = js_code
@@ -80,7 +81,6 @@ async def start(self):
if self.browser is None:
browser_args = {
"headless": self.headless,
# "headless": False,
"args": [
"--disable-gpu",
"--disable-dev-shm-usage",
@@ -95,7 +95,14 @@
browser_args["proxy"] = proxy_settings


self.browser = await self.playwright.chromium.launch(**browser_args)
# Select the appropriate browser based on the browser_type
if self.browser_type == "firefox":
self.browser = await self.playwright.firefox.launch(**browser_args)
elif self.browser_type == "webkit":
self.browser = await self.playwright.webkit.launch(**browser_args)
else:
self.browser = await self.playwright.chromium.launch(**browser_args)

await self.execute_hook('on_browser_created', self.browser)

async def close(self):
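For reference, a standalone sketch of the same per-engine dispatch with plain Playwright; getattr over the Playwright object is a compact equivalent of the if/elif chain above. This is an aside, not part of the commit.

from playwright.async_api import async_playwright

async def launch_browser(browser_type: str = "chromium", **browser_args):
    p = await async_playwright().start()
    # Playwright exposes .chromium, .firefox and .webkit launchers by name
    launcher = getattr(p, browser_type, p.chromium)
    return await launcher.launch(**browser_args)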
@@ -145,7 +152,6 @@ def _cleanup_expired_sessions(self):
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))


async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
wait_for = wait_for.strip()

@@ -209,6 +215,48 @@ async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout:
except Exception as e:
raise RuntimeError(f"Error in wait condition: {str(e)}")

async def process_iframes(self, page):
# Find all iframes
iframes = await page.query_selector_all('iframe')

for i, iframe in enumerate(iframes):
try:
# Add a unique identifier to the iframe
await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')

# Get the frame associated with this iframe
frame = await iframe.content_frame()

if frame:
# Wait for the frame to load
await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout

# Extract the content of the iframe's body
iframe_content = await frame.evaluate('() => document.body.innerHTML')

# Generate a unique class name for this iframe
class_name = f'extracted-iframe-content-{i}'

# Replace the iframe with a div containing the extracted content
_iframe = iframe_content.replace('`', '\\`')
await page.evaluate(f"""
() => {{
const iframe = document.getElementById('iframe-{i}');
const div = document.createElement('div');
div.innerHTML = `{_iframe}`;
div.className = '{class_name}';
iframe.replaceWith(div);
}}
""")
else:
print(f"Warning: Could not access content frame for iframe {i}")
except Exception as e:
print(f"Error processing iframe {i}: {str(e)}")

# Return the page object
return page


async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
response_headers = {}
status_code = None
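Because process_iframes wraps each inlined iframe in a div with a predictable class name (extracted-iframe-content-<index>), callers can later isolate iframe-only text from the crawled HTML. A downstream sketch, assuming BeautifulSoup is available:

from bs4 import BeautifulSoup

def iframe_sections(html):
    soup = BeautifulSoup(html, "html.parser")
    # divs produced by process_iframes carry class extracted-iframe-content-<index>
    return [div.get_text(" ", strip=True)
            for div in soup.select('div[class^="extracted-iframe-content-"]')]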
@@ -263,6 +311,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
status_code = 200
response_headers = {}


await page.wait_for_selector('body')
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

@@ -305,11 +354,78 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)


# New code to update image dimensions
update_image_dimensions_js = """
() => {
return new Promise((resolve) => {
const filterImage = (img) => {
// Filter out images that are too small
if (img.width < 100 && img.height < 100) return false;
// Filter out images that are not visible
const rect = img.getBoundingClientRect();
if (rect.width === 0 || rect.height === 0) return false;
// Filter out images with certain class names (e.g., icons, thumbnails)
if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;
// Filter out images with certain patterns in their src (e.g., placeholder images)
if (img.src.includes('placeholder') || img.src.includes('icon')) return false;
return true;
};
const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
let imagesLeft = images.length;
if (imagesLeft === 0) {
resolve();
return;
}
const checkImage = (img) => {
if (img.complete && img.naturalWidth !== 0) {
img.setAttribute('width', img.naturalWidth);
img.setAttribute('height', img.naturalHeight);
imagesLeft--;
if (imagesLeft === 0) resolve();
}
};
images.forEach(img => {
checkImage(img);
if (!img.complete) {
img.onload = () => {
checkImage(img);
};
img.onerror = () => {
imagesLeft--;
if (imagesLeft === 0) resolve();
};
}
});
// Fallback timeout of 5 seconds
setTimeout(() => resolve(), 5000);
});
}
"""
await page.evaluate(update_image_dimensions_js)

# Wait a bit for any onload events to complete
await page.wait_for_timeout(100)

# Process iframes
if kwargs.get("process_iframes", False):
page = await self.process_iframes(page)

await self.execute_hook('before_retrieve_html', page)
# Check if delay_before_return_html is set then wait for that time
delay_before_return_html = kwargs.get("delay_before_return_html")
if delay_before_return_html:
await asyncio.sleep(delay_before_return_html)

html = await page.content()
await self.execute_hook('before_return_html', page, html)
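The injected script above writes explicit width/height attributes onto qualifying <img> tags, so the HTML returned by crawl() can be size-filtered without re-downloading images. A downstream sketch of that filtering, assuming BeautifulSoup:

from bs4 import BeautifulSoup

def large_images(html, min_side=150):
    soup = BeautifulSoup(html, "html.parser")
    urls = []
    for img in soup.find_all("img"):
        try:
            w, h = int(img.get("width", 0)), int(img.get("height", 0))
        except (TypeError, ValueError):
            continue  # skip non-numeric or missing dimensions
        if w >= min_side and h >= min_side:
            urls.append(img.get("src") or img.get("data-src") or "")
    return urls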

@@ -398,7 +514,6 @@ async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = Non
except Error as e:
raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")


async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
semaphore = asyncio.Semaphore(semaphore_count)
13 changes: 8 additions & 5 deletions crawl4ai/content_scrapping_strategy.py
Expand Up @@ -16,8 +16,6 @@
CustomHTML2Text
)



class ContentScrappingStrategy(ABC):
@abstractmethod
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
@@ -129,7 +127,7 @@ def fetch_image_file_size(img, base_url):
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
image_format = os.path.splitext(img.get('src',''))[1].lower()
# Remove . from format
image_format = image_format.strip('.')
image_format = image_format.strip('.').split('?')[0]
score = 0
if height_value:
if height_unit == 'px' and height_value > 150:
@@ -158,6 +156,7 @@ def fetch_image_file_size(img, base_url):
return None
return {
'src': img.get('src', ''),
'data-src': img.get('data-src', ''),
'alt': img.get('alt', ''),
'desc': find_closest_parent_with_useful_text(img),
'score': score,
@@ -275,11 +274,14 @@ def flatten_nested_elements(node):
# Replace base64 data with empty string
img['src'] = base64_pattern.sub('', src)
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
cleaned_html = sanitize_html(cleaned_html)

h = CustomHTML2Text()
h.ignore_links = True
markdown = h.handle(cleaned_html)
h.body_width = 0
try:
markdown = h.handle(cleaned_html)
except Exception as e:
markdown = h.handle(sanitize_html(cleaned_html))
markdown = markdown.replace(' ```', '```')

try:
@@ -288,6 +290,7 @@ def flatten_nested_elements(node):
print('Error extracting metadata:', str(e))
meta = {}

cleaned_html = sanitize_html(cleaned_html)
return {
'markdown': markdown,
'cleaned_html': cleaned_html,
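The h.body_width = 0 setting added above turns off the converter's hard line-wrapping (the default wraps output at 78 columns), which keeps sentences intact for downstream LLM extraction. A minimal sketch with the stock html2text package, assuming CustomHTML2Text inherits this behavior from html2text.HTML2Text:

import html2text

h = html2text.HTML2Text()
h.ignore_links = True
h.body_width = 0  # 0 disables wrapping; long paragraphs stay on one line
print(h.handle("<h1>Title</h1><p>A long paragraph that would otherwise be wrapped.</p>"))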
4 changes: 2 additions & 2 deletions crawl4ai/prompts.py
@@ -1,4 +1,4 @@
PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage:
PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
<url>{URL}</url>
And here is the cleaned HTML content of that webpage:
@@ -79,7 +79,7 @@
2. For each block:
a. Assign it an index based on its order in the content.
b. Analyze the content and generate ONE semantic tag that describe what the block is about.
c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
c. Extract the text content, EXACTLY SAME AS THE GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field.
3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.