Skip to content

Commit e8f3407

Browse files
author
huayang wang
committed
东方财富网爬虫
1 parent 9d478b4 commit e8f3407

File tree

8 files changed

+1337
-0
lines changed

8 files changed

+1337
-0
lines changed

finance.eastmoney.com/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
之前帮客户做的爬虫, 爬取东方财富网的每日的股票、可转债的数据
2+
3+
网址:http://finance.eastmoney.com/
4+
5+
数据:每日运行生成一个csv文件
6+
7+
博客地址:https://blog.csdn.net/weixin_43746433
8+
9+
测试:代码截止2020/04/23测试无误

finance.eastmoney.com/__init__.py

Whitespace-only changes.

finance.eastmoney.com/可还债/__init__.py

Whitespace-only changes.

finance.eastmoney.com/可还债/id20200424.csv

Lines changed: 361 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import re
2+
import os
3+
import requests
4+
import json
5+
import pandas as pd
6+
7+
# Pandas display options: show every row/column and align CJK (wide) text
# so debug prints of the scraped tables stay readable.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.width', 5000)
# Browser-like User-Agent so the eastmoney endpoints do not reject us.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
14+
# Parse one convertible-bond listing page and append its rows to the daily CSV.
# Input : listing-page URL, unix timestamp (forwarded to get_data), and the
#         YYYYMMDD string used to name the output file.
# Output: rows appended to id<time_str>.csv via to_csv().
def HTML(url, time_, time_str):
    gupiao_list = []
    html = ""  # BUG FIX: html was unbound when the request below failed
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        # BUG FIX: the original `"wrong:" + e` raised TypeError (str + Exception).
        print("wrong:", e)
    # Extract the embedded JSON array (raw string so the escapes are literal).
    pat = re.compile(r"\[\{.*?\}\]")
    data = pat.findall(html)
    if not data:
        # Nothing parseable (e.g. the request failed) — skip this page instead
        # of crashing on data[0].
        return
    js = json.loads(data[0])
    for i in range(len(js)):
        # "-" in LISTDATE means the bond is not listed yet; skip those.
        if js[i]["LISTDATE"] != "-":
            print(str(js[i]["BONDCODE"]))
            time.sleep(1)  # throttle the per-bond quote requests
            lilv, jinkia, zhenfu, zuigao, zuidi, zuoshou, chengjiaoliang, chengjiaoe = get_data(
                str(js[i]["BONDCODE"]), time_)
            print(lilv, jinkia, zhenfu, zuigao, zuidi, zuoshou, chengjiaoliang, chengjiaoe)
            # Values above 1000 appear to come back in a 10x unit; rescale.
            # (Removed an unused `list = [...]` local that shadowed the builtin.)
            if lilv > 1000:
                lilv = lilv / 10
            if jinkia > 1000:
                jinkia = jinkia / 10
            if zuigao > 1000:
                zuigao = zuigao / 10
            if zuidi > 1000:
                zuidi = zuidi / 10
            if zuoshou > 1000:
                zuoshou = zuoshou / 10
            row = (js[i]["SNAME"], js[i]["BONDCODE"], js[i]["CORRESCODE"], js[i]["STARTDATE"],
                   lilv, jinkia, zhenfu, zuigao, zuidi, zuoshou, chengjiaoliang, chengjiaoe)
            gupiao_list.append(row)
    title = ["债券简称","债券代码", "正股代码","上市时间","现价","今开","振幅","最高","最低","昨收","成交量","成交额"]
    df = pd.DataFrame(gupiao_list, columns=title)
    to_csv(df, f"id{time_str}.csv")
56+
57+
# Append `df` to `csv_file`, writing the header row only when the file is new.
def to_csv(df, csv_file):
    if not os.path.exists(csv_file):
        # First write of the day: include the header.
        df.to_csv(csv_file, index=False)
    else:
        # File already exists: append rows without repeating the header.
        df.to_csv(csv_file, mode='a+', header=False, index=False)
63+
64+
65+
# Field keys returned by the push2 quote API (per the inline comments below):
# f43 price, f46 open, f171 amplitude, f44 high, f45 low,
# f60 previous close, f47 volume, f48 turnover.

def _fetch(html_url):
    # GET a URL with the module-level headers; return response text, "" on failure.
    try:
        r = requests.get(html_url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        # BUG FIX: the original `"wrong:" + e` raised TypeError (str + Exception).
        print("wrong:", e)
        return ""


def _parse_quote(html):
    # Strip the JSONP wrapper and pull the quote fields out of the payload.
    pat = re.compile(r"({.*?\})")
    data = pat.findall(html + "}}")
    # SECURITY NOTE: eval() on remote response text is unsafe; it is kept to
    # preserve the original parsing behavior — prefer json.loads if revisited.
    d = eval(data[0] + "}")
    new_data = d.get("data")
    lilv = new_data.get("f43")            # latest price
    jinkia = new_data.get("f46")          # open
    zhenfu = new_data.get("f171")         # amplitude
    zuigao = new_data.get("f44")          # high
    zuidi = new_data.get("f45")           # low
    zuoshou = new_data.get("f60")         # previous close
    chengjiaoliang = new_data.get("f47")  # volume
    chengjiaoe = new_data.get("f48")      # turnover
    return (round(float(lilv) / 100, 2), round(float(jinkia) / 100, 2),
            str(round(float(zhenfu), 2)) + "%", round(float(zuigao) / 100, 2),
            round(float(zuidi) / 100, 2), round(float(zuoshou) / 100, 2),
            chengjiaoliang, chengjiaoe)


# Fetch one convertible bond's quote.
# Input : bond id and unix timestamp (JSONP callback name / cache buster).
# Output: (price, open, "amplitude%", high, low, prev close, volume, turnover),
#         or eight zeros when the request or the parse fails.
def get_data(id, time_):
    url = "http://push2.eastmoney.com/api/qt/stock/get?secid=1."+str(id)+"&ut=bd1d9ddb04089700cf9c27f6f7426281&fields=f43,f169,f170,f46,f60,f84,f116,f44,f45,f171,f126,f47,f48,f168,f164,f49,f161,f55,f92,f59,f152,f167,f50,f86,f71,f172,f182,f191,f192,f532&cb=jQuery1124021434030444820706_"+str(time_)+"000"+"&type=CT&cmd=1280922&sty=FDPBPFB&st=z&js=((x))&token=4f1862fc3b5e77c150a2b985b12db0fd&_="+str(time_)+"000"
    print(url)
    html = _fetch(url)  # BUG FIX: html could be unbound when the request failed
    print(html)
    try:
        try:
            return _parse_quote(html)
        except Exception:
            # First parse failed — the original retried the identical URL once
            # (its "fallback" URL was byte-for-byte the same), then re-parsed.
            print("*" * 100)
            html = _fetch(url)
            return _parse_quote(html)
    except Exception:
        # Unrecoverable: return zeros so the caller's loop keeps running.
        return 0, 0, 0, 0, 0, 0, 0, 0
132+
133+
import time
# Crawl every convertible-bond listing page (pages 1..8).
# Input : unix timestamp and its YYYYMMDD string form.
# Output: rows written to id<time_str>.csv via HTML().
def main(time_, time_str):
    for page in range(1, 9):
        page_url = ("http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?type=KZZ_LB2.0&token=70f12f2f4f091e459a279469fe49eca5&cmd=&st=STARTDATE&sr=-1&p=" + str(page)
                    + "&ps=50&js=var%20GCPJwVrm={pages:(tp),data:(x),font:(font)}&rt=52919237")
        print(page_url)
        # Parse this page and persist its rows.
        HTML(page_url, time_, time_str)
        # Pause between pages to avoid hammering the server.
        time.sleep(2)
144+
145+
# Convert a YYYYMMDD date string into a unix timestamp (seconds, local time).
def get_time(time_str):
    parsed = time.strptime(time_str, "%Y%m%d")
    return int(time.mktime(parsed))
152+
153+
if __name__ == '__main__':
    # Entry point: crawl each requested trading day (YYYYMMDD strings).
    for time_str in ["20200426"]:
        time_ = get_time(time_str)
        print("时间戳",time_)
        # Kick off the crawl for this day.
        main(time_,time_str)
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import re
2+
import os
3+
import requests
4+
import json
5+
import pandas as pd
6+
7+
# Pandas display options: show every row/column and align CJK (wide) text
# so debug prints of the scraped tables stay readable.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.width', 5000)
12+
13+
# Convert a YYYYMMDD date string into a unix timestamp (seconds, local time).
def get_time(time_str):
    struct = time.strptime(time_str, "%Y%m%d")
    return int(time.mktime(struct))
20+
21+
# Parse one page of stock listings and append its rows to the daily CSV.
# Input : YYYYMMDD string (file naming), the list-API URL, and its query params.
# Output: rows appended to result_<time_str>.csv via to_csv().
def HTML(time_str, url, params):
    gupiao_list = []
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
    html = ""  # BUG FIX: html was unbound when the request below failed
    try:
        r = requests.get(url, headers=headers, timeout=30, params=params)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        # BUG FIX: the original `"wrong:" + e` raised TypeError (str + Exception).
        print("wrong:", e)
    # Pull every {...} fragment out of the JSONP response (raw string regex).
    pat = re.compile(r"({.*?})")
    data = pat.findall(html)
    js = []
    for d in data:
        # SECURITY NOTE: eval() on remote text is unsafe; kept to preserve the
        # original JSONP-repair behavior — prefer json.loads if revisited.
        try:
            d1 = eval(d + "]}}").get("data").get("diff")[0]
        except Exception:
            d1 = eval(d)
        js.append(d1)
    for i in range(len(js)):
        zhenfu = str(js[i]["f7"]) + "%"       # amplitude (振幅)
        # BUG FIX: the 涨跌幅 column was filled with f7 (amplitude) as well,
        # duplicating 振幅; the change percentage is field f3 — TODO confirm
        # against the eastmoney field map (f3 is requested in main's params).
        zhangdiefu = str(js[i]["f3"]) + "%"   # change pct (涨跌幅)
        gupiao_list.append((
            js[i]["f12"], js[i]["f14"], js[i]["f2"], zhangdiefu, js[i]["f4"], js[i]["f5"], js[i]["f6"],
            zhenfu, js[i]["f15"], js[i]["f16"], js[i]["f17"], js[i]["f18"], js[i]["f10"]))
    title = ["代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额",
             "振幅", "最高", "最低", "今开", "昨收", "量比"]
    df = pd.DataFrame(gupiao_list, columns=title)
    to_csv(df, f"result_{time_str}.csv")
55+
56+
# Append `df` to `csv_file`, writing the header row only when the file is new.
# (Original comment said "保存csv图片" — it saves a CSV file, not an image.)
def to_csv(df, csv_file):
    if not os.path.exists(csv_file):
        # First write of the day: include the header.
        df.to_csv(csv_file, index=False)
    else:
        # File already exists: append rows without repeating the header.
        df.to_csv(csv_file, mode='a+', header=False, index=False)
62+
63+
64+
import time
# Crawl every stock-list page (1..249) and append each one to the daily CSV.
# Input : YYYYMMDD string and its unix-timestamp form.
# Output: rows written via HTML() to result_<time_str>.csv.
def main(time_str, time_):
    # The API expects a millisecond-style timestamp; time_ is a str from here on.
    time_ = str(time_) + "000"
    for page in range(1, 250):
        print(page)
        url = 'http://push2.eastmoney.com/api/qt/clist/get'
        params = {
            # Redundant str(time_) calls removed: time_ is already a str.
            'cb': f'jQuery112407955974158503321_{time_}',
            'pn': str(page),
            'pz': '20',
            'po': '1',
            'np': '1',
            'ut': 'bd1d9ddb04089700cf9c27f6f7426281',
            'fltt': '2',
            'invt': '2',
            'fid': 'f3',
            'fs': 'm:0 t:6,m:0 t:13,m:0 t:80,m:1 t:2,m:1 t:23',
            'fields': 'f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152',
            '_': time_
        }
        # Parse this page and persist its rows.
        HTML(time_str, url, params)
        # Sleep between pages to avoid rate limiting.
        time.sleep(6)
92+
93+
94+
if __name__ == '__main__':
    # Entry point: crawl each requested trading day (YYYYMMDD strings).
    for time_str in ["20200417"]:
        time_ = get_time(time_str)
        # Kick off the crawl for this day.
        main(time_str,time_)

0 commit comments

Comments
 (0)