A toy project I wrote to practice my web-scraping skills.
Below is the source code; everything can be scraped except WeChat Official Account articles, which still have problems.
Selenium is used to simulate opening the pages in a real browser. I first tried plain requests.get(), but Baidu's security verification page kept appearing, and adding headers such as Referer and User-Agent did not get around it, so I switched to this approach. It works in practice, but crawling is noticeably slower.
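For reference, here is a minimal sketch of the direct requests.get() attempt described above; the header values are illustrative assumptions rather than the exact ones originally used. The working Selenium version follows after it.

import requests

# Illustrative headers; as noted above, Baidu still answers this kind of
# request with its security-verification page ("百度安全验证").
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://www.baidu.com/',
}
response = requests.get(
    'https://www.baidu.com/s',
    params={'wd': '大模型 银行'},
    headers=headers,
)
print(response.status_code)
print('百度安全验证' in response.text)  # True when the verification page is returned instead of real results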
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
from bs4 import BeautifulSoup
import requests
import chardet
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # 无头模式
chrome_options.add_argument("--disable-gpu")
# Create the Chrome driver
service = Service('C:\\Users\\lyc666\\Desktop\\SPI\\chromedriver-win64\\chromedriver.exe')  # replace with your own chromedriver path
driver = webdriver.Chrome(service=service, options=chrome_options)
# Open the Baidu search results page (the wd parameter is the URL-encoded query "大模型 银行")
driver.get('https://www.baidu.com/s?wd=%E5%A4%A7%E6%A8%A1%E5%9E%8B%20%E9%93%B6%E8%A1%8C')
# Extract the news links and titles from the current results page
def extract_news_links(driver):
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    news_links = []
    for h3 in soup.find_all('h3', class_='c-title t t tts-title'):
        a_tag = h3.find('a')
        if a_tag:
            title = a_tag.get_text()
            link = a_tag['href']
            news_links.append({'title': title, 'link': link, 'referer': driver.current_url})
    return news_links
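Each item returned by extract_news_links is a dict holding the result title, its link, and the current search-results URL stored under 'referer', so the originating page is still available when the article itself is fetched later. For example (values are purely illustrative):
# {'title': '...', 'link': 'https://www.baidu.com/link?url=...', 'referer': 'https://www.baidu.com/s?wd=...'}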
# Extract news links from the first results page
all_news_links = extract_news_links(driver)
# Crawl additional result pages
for i in range(2, 6):  # crawl up to the first 5 result pages
    try:
        next_page_button = driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')  # "下一页" is Baidu's "next page" link text
        next_page_button.click()
        time.sleep(2)  # wait for the next page to load
        news_links = extract_news_links(driver)
        all_news_links.extend(news_links)
    except Exception as e:
        print(f"Failed to go to page {i}: {e}")
        break
# Fetch the full content of each news page
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)