11
11
This directory stores browsing data, including cookies, cache, and user profiles.
To avoid data corruption, Chrome does not allow multiple instances to use the
same directory simultaneously.
Suggested Changes
import os
import random
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
# Search page that is queried once per letter. The pasted URL had been split
# across two lines and rewritten through a third-party mirror host; this is the
# direct address of the same page.
base_url = (
    "https://incometaxindia.gov.in/Pages/utilities/"
    "exempted-institutions.aspx"
)

# One dict per scraped record; converted to a DataFrame once scraping ends.
all_records = []

# Each letter a-z is typed into the Name field as a separate search.
search_chars = [chr(i) for i in range(ord('a'), ord('z') + 1)]

# The driver is created before the outer `try` so that `finally` only calls
# quit() on a driver that actually exists. No --user-data-dir is passed, so
# ChromeDriver falls back to a fresh temporary profile rather than sharing a
# profile directory with another running Chrome instance.
options = Options()
driver = webdriver.Chrome(options=options)

try:
    for char in tqdm(search_chars, desc="Processing characters"):
        driver.get(base_url)
        time.sleep(random.uniform(1, 3))  # Random delay

        try:
            # Wait for the search form to be present before typing into it.
            name_field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@placeholder='Name']")
                )
            )
            name_field.send_keys(char)
            search_button = driver.find_element(
                By.XPATH, "//input[@value='Search']"
            )
            search_button.click()
            time.sleep(3)

            # Walk the result pages until there is no usable "Next" link.
            while True:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # TODO: adjust the class name to match the real result markup.
                records = soup.find_all('div', class_='record-item')
                for record in records:
                    # NOTE(review): the original per-record extraction was
                    # missing from the source; storing the search letter and
                    # the flattened text is a placeholder schema - confirm
                    # the fields that are actually wanted.
                    record_data = {
                        "search_char": char,
                        "text": record.get_text(" ", strip=True),
                    }
                    all_records.append(record_data)

                try:
                    next_button = driver.find_element(
                        By.XPATH, "//a[contains(text(), 'Next')]"
                    )
                    # get_attribute() returns None when the attribute is
                    # absent; treat that as "not disabled".
                    if 'disabled' in (next_button.get_attribute('class') or ''):
                        break
                    next_button.click()
                    time.sleep(random.uniform(2, 4))
                except WebDriverException:
                    # No "Next" link (or it cannot be clicked): last page.
                    break
        except Exception as e:
            # Best effort: report the failure and move on to the next letter.
            print(f"Error processing character {char}: {e}")
finally:
    # Ensure driver quits even if error occurs
    driver.quit()

# Save data
df = pd.DataFrame(all_records)
df.to_csv('exempted_institutions.csv', index=False)
print("Scraping completed. Data saved.")
Use code with caution
Explanation of Changes:
0s
completed at 12:03 PM