Web Scraping: Parsing Web Pages with BeautifulSoup
Basic usage of BeautifulSoup
Install bs4 and lxml
The bs4 package contains BeautifulSoup.
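Both can be installed with pip (beautifulsoup4 is the package name on PyPI; the legacy name bs4 pulls it in too):
pip install beautifulsoup4 lxml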
from bs4 import BeautifulSoup
Copy the example from the official documentation to use as a sample:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
Instantiate a BeautifulSoup object
soup = BeautifulSoup(html_doc, "lxml")
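If lxml is not installed, Python's built-in parser can be used instead (slower, but needs no extra dependency):
# soup = BeautifulSoup(html_doc, "html.parser")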
Pretty-printed output (prints the HTML with strict indentation)
# print(soup)
# print(soup.prettify())
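The first lines of the prettify() output look roughly like this (one space of indentation per nesting level; details can vary by parser):
# <html>
#  <head>
#   <title>
#    The Dormouse's story
#   </title>
#  </head>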
Get a tag
# tag = soup.title
# print(tag)  # <title>The Dormouse's story</title>
Get only the tag name
# name = tag.name
# print(name)  # title
Get the text content
# text = tag.string
# print(text)  # The Dormouse's story
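.string only works when the tag has a single text child; for tags with mixed content it returns None, while get_text() concatenates all nested text. A quick check (both are standard bs4 calls):
# tag = soup.find(class_="story")
# print(tag.string)      # None — the tag contains text plus three <a> children
# print(tag.get_text())  # the full sentence, including the link texts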
When several tags match, attribute access returns the first one
tag = soup.p
# print(tag)  # <p class="title"><b>The Dormouse's story</b></p> (first match from top to bottom)
Find all p tags
tags = soup.find_all("p")
# print(len(tags))  # 3
Filter on attribute values; the format is attribute name: attribute value, e.g. attrs={"class": "story"}
tags = soup.find_all("p", attrs={"class": "story"})
# print(len(tags))  # 2
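The same filter can be written as a CSS selector via select(), which is also part of the bs4 API:
tags = soup.select("p.story")
# print(len(tags))  # 2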
Exact search with find. Note: to filter on class you must write class_, because class collides with the Python keyword.
tag = soup.find(class_="title")
# print(tag)  # <p class="title"><b>The Dormouse's story</b></p>
Find the parent node
tag = soup.title
# print(tag.parent)  # <head><title>The Dormouse's story</title></head>
Get the parent node's tag name
# print(tag.parent.name)  # head
The parent's parent
# print(tag.parent.parent.name)  # html
# print(tag.parent.parent.parent.name)  # [document] — the outermost node; the parse tree itself is the root of everything
Get a tag's attribute values by subscript
tag = soup.p
# ① get the attribute value via get("class")
value = tag.get("class")
# print(value)  # ['title'] — a list, because an attribute can hold several values
# ② subscript access
value = soup.p["class"]
# print(value)  # ['title']
value = soup.a.get("id")
# print(value)  # link1 — id holds one value, so no list; soup.a returns the first <a> by default
Get all attributes of the a tag
attrs = soup.a.attrs
# print(attrs)  # {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'} — a dict of key-value pairs
Three traversal directions
1:
# traverse downward
# tag = soup.body
# print(tag)  # the whole <body>...</body> element
# print(tag.contents)  # a list of the direct children, including '\n' text nodes
# print(tag.children)  # <list_iterator object at 0x0000022681D54088> — an iterator over the direct children
# for c in tag.children:
#     print(c)  # each direct child in turn
# print(tag.descendants)  # a generator over all descendant nodes
# for c in tag.descendants:
#     print(c)  # every descendant node, including whitespace-only text and the commas between the links
# print("*"*20)
2:
# traverse upward to the parent nodes
# print(soup.title.parent.name)  # head
# note: soup.title.parents is a generator and has no .name of its own — iterate over it instead
# walk up through all the parent nodes
# for p in soup.title.parents:
#     print(p.name)
# output:
# head
# html
# [document]
3:
# traverse sideways (siblings)
tag = soup.a
# print(tag.next_sibling.next_sibling.name)  # a
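Two next_sibling hops are needed because the first sibling is the text node between the tags; repr() makes this visible:
# print(repr(tag.next_sibling))  # ',\n' — the text between the first two <a> tags
# print(tag.next_sibling.next_sibling)  # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>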
BeautifulSoup worked example: scraping the Maoyan movie ranking
"""
BeautifulSoup worked example: scraping the Maoyan movie ranking
"""
import requests
from bs4 import BeautifulSoup
import os
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}
response = requests.get("https://maoyan.com/board/4", headers=headers)
if response.status_code == 200:
    # print(response.text)  # debug: inspect the raw HTML first
    # parse the page
    soup = BeautifulSoup(response.text, "lxml")
    imgTags = soup.find_all("img", attrs={"class": "board-img"})
    # get the current working directory
    root = os.getcwd()
    # create the folder "page_1" under it and switch into it
    os.makedirs("page_1", exist_ok=True)
    os.chdir(os.path.join(root, "page_1"))
    for imgTag in imgTags:
        # print(imgTag)  # note: the HTML served to a script may differ from what the browser shows, so print it and match the attributes against the actual response
        name = imgTag.get("alt")
        src = imgTag.get("data-src")
        # request the image itself
        resp = requests.get(src, headers=headers)
        # save it
        with open(f"{name}.png", "wb") as f:
            f.write(resp.content)
        print(f"{name} {src} saved")
Scraping the "Best Chinese Universities" (zuihaodaxue.com) ranking
"""
BeautifulSoup worked example:
scraping the "Best Chinese Universities" ranking
"""
import requests
from bs4 import BeautifulSoup
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}
response = requests.get("http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html", headers=headers)
response.encoding = "utf-8"
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "lxml")
    trTags = soup.find_all("tr", attrs={"class": "alt"})
    for trTag in trTags:
        rank = trTag.contents[0].string
        name = trTag.contents[1].string
        province = trTag.contents[2].string
        score = trTag.contents[3].string
        print(f"{rank} {name} {province} {score}")