feat: Add hooks for enhanced control over Selenium drivers
- Added five hooks: on_driver_created, before_get_url, after_get_url, before_return_html, and on_user_agent_updated.
- Included example usage in quickstart.py.
- Updated README and changelog.
unclecode committed Jun 18, 2024
1 parent 6d04284 commit 853b9d5
Showing 5 changed files with 26 additions and 4 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,15 @@
# Changelog

## [0.2.5] - 2024-06-18
### Added
- Added five important hooks to the crawler:
  - on_driver_created: Called when the driver is created and ready for initialization.
  - before_get_url: Called right before Selenium fetches the URL.
  - after_get_url: Called after Selenium fetches the URL.
  - before_return_html: Called when the data is parsed and ready.
  - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
- Added an example in `quickstart.py` in the example folder under the docs.

## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
10 changes: 10 additions & 0 deletions README.md
@@ -13,6 +13,16 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

## Recent Changes

### v0.2.5
- 🌟 Added five important hooks to the crawler (a brief sketch of the hook signature follows this list):
  - 🟢 on_driver_created: Called when the driver is created and ready for initialization.
  - 🔵 before_get_url: Called right before Selenium fetches the URL.
  - 🟣 after_get_url: Called after Selenium fetches the URL.
  - 🟠 before_return_html: Called when the data is parsed and ready.
  - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
- 📄 Added an example in `quickstart.py` in the example folder under the docs.
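The hooks above share a simple contract, visible in the `crawl4ai/crawler_strategy.py` hunk further down: each hook receives the Selenium driver and must return it, since the result of `execute_hook` is assigned back to `self.driver`. A minimal sketch of two such hooks follows; the window size and cookie cleanup are illustrative assumptions, not part of this commit.

```python
# Sketch only: illustrates the hook signature implied by the diff
# (receive the driver, return the driver). The window size and cookie
# cleanup are assumed examples, not behavior defined by this commit.
from selenium import webdriver


def on_driver_created(driver: webdriver.Chrome) -> webdriver.Chrome:
    # Runs once the Chrome driver exists, before any URL is fetched.
    driver.set_window_size(1280, 1024)
    return driver


def before_get_url(driver: webdriver.Chrome) -> webdriver.Chrome:
    # Runs right before driver.get(url); handy for per-request setup.
    driver.delete_all_cookies()
    return driver
```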


### v0.2.4
- 🐞 Resolve the issue with the long url. (Issue #22)

4 changes: 3 additions & 1 deletion crawl4ai/crawler_strategy.py
@@ -104,6 +104,7 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
         # Hooks
         self.hooks = {
             'on_driver_created': None,
+            'on_user_agent_updated': None,
             'before_get_url': None,
             'after_get_url': None,
             'before_return_html': None
@@ -114,6 +115,7 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
         self.service = Service(chromedriver_autoinstaller.install())
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
+        self.driver = self.execute_hook('on_driver_created', self.driver)

     def set_hook(self, hook_type: str, hook: Callable):
         if hook_type in self.hooks:
@@ -137,7 +139,7 @@ def update_user_agent(self, user_agent: str):
         self.options.add_argument(f"user-agent={user_agent}")
         self.driver.quit()
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
-        self.driver = self.execute_hook('on_driver_created', self.driver)
+        self.driver = self.execute_hook('on_user_agent_updated', self.driver)

     def set_custom_headers(self, headers: dict):
         # Enable Network domain for sending headers
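Registration goes through `set_hook(hook_type, hook)` shown above, and `update_user_agent` now fires `on_user_agent_updated` instead of reusing `on_driver_created`. A brief usage sketch, assuming a strategy class named `LocalSeleniumCrawlerStrategy` with a no-argument constructor; both the class name and the constructor are assumptions, since neither appears in this diff.

```python
# Sketch only: LocalSeleniumCrawlerStrategy and its no-argument constructor
# are assumptions; set_hook and update_user_agent come from the diff above.
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy


def log_new_driver(driver):
    # A hook may be any callable that accepts and returns the driver.
    print("Driver re-created after user-agent change:", driver.session_id)
    return driver


strategy = LocalSeleniumCrawlerStrategy()
strategy.set_hook('on_user_agent_updated', log_new_driver)

# Quits the old driver, starts a new one with the new user agent, and now
# triggers 'on_user_agent_updated' rather than 'on_driver_created'.
strategy.update_user_agent("Mozilla/5.0 (X11; Linux x86_64)")
```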
2 changes: 1 addition & 1 deletion pages/index.html
@@ -25,7 +25,7 @@
 <header class="bg-zinc-950 text-lime-500 py-4 flex">

 <div class="mx-auto px-4">
-<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
+<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
 </div>
 <div class="mx-auto px-4 flex font-bold text-xl gap-2">
 <span>📊 Total Website Processed</span>
2 changes: 1 addition & 1 deletion setup.py
@@ -26,7 +26,7 @@ def run(self):

 setup(
     name="Crawl4AI",
-    version="0.2.4",
+    version="0.2.5",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
