import requests
import csv
import time
import json
from urllib.parse import quote
class WeiboCommentSpider:
    """Scrape comments of a single Weibo post via the ajax ``buildComments`` endpoint.

    Results are streamed to ``weibo_<id>_comments.csv`` as they arrive and
    also collected into a list that :meth:`get_comments` returns.
    """

    def __init__(self):
        # NOTE(review): the Cookie below is a hard-coded session credential.
        # It must be replaced with a currently valid one before running, and
        # should not be committed to source control.
        self.headers = {
            'Cookie': 'SCF=AvNaC-yoAZgGhIJXnWWnzLZULdN-SvZo--eSuhRzMO_PIwasCHQEJft-Oixnam7I9JsN7fPfHauIZu0S2bFpsm4.; SINAGLOBAL=6789592846111.761.1746428994697; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5_RVZP51uTS_INkIiodohZ5JpX5KMhUgL.FoqpS0e4Shq7Soq2dJLoIp7LxKqL1KnLB-BLxKqL1KnLB-xki--fiK.fiKyW; ALF=1750776292; SUB=_2A25FN1y0DeRhGeBP7FEY9CjMzTqIHXVmTdB8rDV8PUJbkNAbLVjfkW1NRS_rOYlA-AlgWKNAESqKp5vr3L8Yb0oF; _s_tentry=www.weibo.com; Apache=3376029741618.9136.1748313163542; ULV=1748313163615:8:3:1:3376029741618.9136.1748313163542:1747578705000',  # replace with a real, valid Cookie
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://weibo.com/'
        }
        # Session reuses the underlying TCP connection across page requests.
        self.session = requests.Session()

    def clean_text(self, text):
        """Return *text* with newlines removed and whitespace runs collapsed to single spaces."""
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = ' '.join(text.split())  # collapse consecutive whitespace
        return text.strip()

    def get_comments(self, weibo_id, max_pages=10, timeout=10):
        """Fetch up to *max_pages* pages of comments for the post *weibo_id*.

        Each comment row is written to ``weibo_<weibo_id>_comments.csv``
        (UTF-8 with BOM, so Excel opens it correctly) and appended to the
        returned list of dicts.

        :param weibo_id: the post's mid string, e.g. the trailing segment of
            ``https://weibo.com/123456789/ABCDEFGHIJ``.
        :param max_pages: maximum number of 20-comment pages to request.
        :param timeout: per-request timeout in seconds (prevents the scraper
            from hanging forever on a stalled connection).
        :return: list of comment dicts (same rows as written to the CSV).
        """
        base_url = 'https://weibo.com/ajax/statuses/buildComments'
        max_id = ''   # pagination cursor returned by the previous page
        page = 1
        all_comments = []
        with open(f'weibo_{weibo_id}_comments.csv', 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=[
                '用户ID', '昵称', '性别', '地区',
                '点赞数', '回复数', '评论内容', '发布时间'
            ])
            writer.writeheader()
            while page <= max_pages:
                params = {
                    'flow': 0,
                    'is_reload': 1,
                    'id': weibo_id,
                    'is_show_bulletin': 2,
                    'is_mix': 0,
                    'count': 20,  # comments per page
                    'max_id': max_id if max_id else 0,
                    'uid': '',  # optionally the author's user id
                    'fetch_level': 0
                }
                try:
                    print(f'正在获取第 {page} 页评论...')
                    response = self.session.get(
                        base_url, params=params, headers=self.headers,
                        timeout=timeout,  # fail fast instead of blocking indefinitely
                    )
                    response.raise_for_status()
                    data = response.json()
                    if not data.get('data'):
                        print('没有更多评论了')
                        break
                    for comment in data['data']:
                        user = comment.get('user', {})
                        cleaned_text = self.clean_text(comment.get('text_raw', ''))
                        row = {
                            '用户ID': user.get('id', ''),
                            '昵称': user.get('screen_name', ''),
                            '性别': '女' if user.get('gender') == 'f' else '男' if user.get('gender') == 'm' else '未知',
                            '地区': user.get('location', '未知'),
                            '点赞数': comment.get('like_counts', 0),
                            '回复数': comment.get('total_number', 0),
                            '评论内容': cleaned_text,
                            '发布时间': comment.get('created_at', '')
                        }
                        all_comments.append(row)
                        writer.writerow(row)
                    # max_id == 0 (or missing) signals the last page.
                    max_id = data.get('max_id')
                    if not max_id or max_id == 0:
                        print('已获取所有评论')
                        break
                    page += 1
                    time.sleep(2)  # polite delay between requests
                # Narrow catch: network/HTTP failures and JSON decode errors only,
                # so genuine programming errors are not silently swallowed.
                except (requests.RequestException, ValueError) as e:
                    print(f'获取第 {page} 页评论失败: {str(e)}')
                    break
        print(f'共获取 {len(all_comments)} 条评论')
        return all_comments
if __name__ == '__main__':
    # Post ID to scrape — the trailing segment of the post URL,
    # e.g. https://weibo.com/123456789/ABCDEFGHIJ
    post_id = 'PoqAaAhNv'

    crawler = WeiboCommentSpider()
    scraped = crawler.get_comments(post_id, max_pages=10)

    # Optionally keep a JSON copy alongside the CSV the spider writes.
    with open(f'weibo_{post_id}_comments.json', 'w', encoding='utf-8') as fp:
        json.dump(scraped, fp, ensure_ascii=False, indent=2)
# Python爬取微博评论代码
# 最新推荐文章于 2025-08-05 17:20:40 发布