diff --git a/Crawer/README.MD b/Crawer/README.MD
deleted file mode 100644
index b73228f3..00000000
--- a/Crawer/README.MD
+++ /dev/null
@@ -1,2 +0,0 @@
-# See the articles for detailed code explanations
-
diff --git a/Crawer/meizitu.py b/Crawer/meizitu.py
deleted file mode 100644
index e26a83d6..00000000
--- a/Crawer/meizitu.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import requests
-import os
-import time
-import threading
-from bs4 import BeautifulSoup
-
-
-def download_page(url):
-    '''
-    Download a page and return its text.
-    '''
-    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
-    r = requests.get(url, headers=headers)
-    r.encoding = 'gb2312'
-    return r.text
-
-
-def get_pic_list(html):
-    '''
-    Get the list of photo albums on a page, then call get_pic on each one.
-    '''
-    soup = BeautifulSoup(html, 'html.parser')
-    pic_list = soup.find_all('li', class_='wp-item')
-    for i in pic_list:
-        a_tag = i.find('h3', class_='tit').find('a')
-        link = a_tag.get('href')
-        text = a_tag.get_text()
-        get_pic(link, text)
-
-
-def get_pic(link, text):
-    '''
-    Fetch every image on the current album page and save it.
-    '''
-    html = download_page(link)  # download the album page
-    soup = BeautifulSoup(html, 'html.parser')
-    pic_list = soup.find('div', id="picture").find_all('img')  # find all images on the page
-    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
-    create_dir('pic/{}'.format(text))
-    for i in pic_list:
-        pic_link = i.get('src')  # the image's own url
-        r = requests.get(pic_link, headers=headers)  # download the image, then save it to a file
-        with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f:  # name the file after the image url, not the page link
-            f.write(r.content)
-        time.sleep(1)  # pause so we don't put too much load on the site and get banned
-
-
-def create_dir(name):
-    if not os.path.exists(name):
-        os.makedirs(name)
-
-
-def execute(url):
-    page_html = download_page(url)
-    get_pic_list(page_html)
-
-
-def main():
-    create_dir('pic')
-    queue = [i for i in range(1, 72)]  # page numbers used to build the urls
-    threads = []
-    while len(queue) > 0:
-        for thread in threads[:]:  # iterate over a copy so removing items is safe
-            if not thread.is_alive():
-                threads.remove(thread)
-        while len(threads) < 5 and len(queue) > 0:  # cap the thread count at 5
-            cur_page = queue.pop(0)
-            url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
-            thread = threading.Thread(target=execute, args=(url,))
-            thread.setDaemon(True)
-            thread.start()
-            print('{} is downloading page {}'.format(threading.current_thread().name, cur_page))
-            threads.append(thread)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/Crawer/qiubai_crawer.py b/Crawer/qiubai_crawer.py
deleted file mode 100644
index e37e7e7d..00000000
--- a/Crawer/qiubai_crawer.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-
-
-def download_page(url):
-    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
-    r = requests.get(url, headers=headers)
-    return r.text
-
-
-def get_content(html, page):
-    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
-    soup = BeautifulSoup(html, 'html.parser')
-    con = soup.find(id='content-left')
-    con_list = con.find_all('div', class_="article")
-    for i in con_list:
-        author = i.find('h2').string  # the author's name
-        content = i.find('div', class_='content').find('span').get_text()  # the post content
-        stats = i.find('div', class_='stats')
-        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
-        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
-        author_info = i.find('div', class_='articleGender')  # the author's age and gender
-        if author_info is not None:  # not an anonymous user
-            class_list = author_info['class']
-            if "womenIcon" in class_list:
-                gender = '女'
-            elif "manIcon" in class_list:
-                gender = '男'
-            else:
-                gender = ''
-            age = author_info.string  # the author's age
-        else:  # anonymous user
-            gender = ''
-            age = ''
-
-        save_txt(output.format(page, author, gender, age, vote, comment, content))
-
-
-def save_txt(*args):
-    for i in args:
-        with open('qiubai.txt', 'a', encoding='utf-8') as f:
-            f.write(i)
-
-
-def main():
-    # Clicking through to the site, the footer shows 13 pages in total, so we can
-    # build the urls as below. Ideally we would use Beautiful Soup to read the page
-    # count from the footer instead of hard-coding it.
-    for i in range(1, 14):
-        url = 'https://qiushibaike.com/text/page/{}'.format(i)
-        html = download_page(url)
-        get_content(html, i)
-
-if __name__ == '__main__':
-    main()
- elif "manIcon" in class_list: - gender = '男' - else: - gender = '' - age = author_info.string # 获取年龄 - else: # 匿名用户 - gender = '' - age = '' - - save_txt(output.format(page, author, gender, age, vote, comment, content)) - - -def save_txt(*args): - for i in args: - with open('qiubai.txt', 'a', encoding='utf-8') as f: - f.write(i) - - -def main(): - # 我们点击下面链接,在页面下方可以看到共有13页,可以构造如下 url, - # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。 - for i in range(1, 14): - url = 'https://qiushibaike.com/text/page/{}'.format(i) - html = download_page(url) - get_content(html, i) - -if __name__ == '__main__': - main() diff --git "a/Python \351\273\221\351\255\224\346\263\225/Python \350\277\234\347\250\213\345\274\200\346\234\272.py" "b/Python \351\273\221\351\255\224\346\263\225/Python \350\277\234\347\250\213\345\274\200\346\234\272.py" new file mode 100644 index 00000000..1b23c160 --- /dev/null +++ "b/Python \351\273\221\351\255\224\346\263\225/Python \350\277\234\347\250\213\345\274\200\346\234\272.py" @@ -0,0 +1,28 @@ +def wake_up(request, mac='DC-4A-3E-78-3E-0A'): + MAC = mac + BROADCAST = "192.168.0.255" + if len(MAC) != 17: + raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'") + mac_address = MAC.replace("-", '') + data = ''.join(['FFFFFFFFFFFF', mac_address * 20]) # 构造原始数据格式 + send_data = b'' + + # 把原始数据转换为16进制字节数组, + for i in range(0, len(data), 2): + send_data = b''.join([send_data, struct.pack('B', int(data[i: i + 2], 16))]) + print(send_data) + + # 通过socket广播出去,为避免失败,间隔广播三次 + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) + sock.sendto(send_data, (BROADCAST, 7)) + time.sleep(1) + sock.sendto(send_data, (BROADCAST, 7)) + time.sleep(1) + sock.sendto(send_data, (BROADCAST, 7)) + return HttpResponse() + print("Done") + except Exception as e: + return HttpResponse() + print(e) \ No newline at end of file diff --git "a/Python \351\273\221\351\255\224\346\263\225/README.MD" "b/Python \351\273\221\351\255\224\346\263\225/README.MD" new file mode 100644 index 00000000..248602a7 --- /dev/null +++ "b/Python \351\273\221\351\255\224\346\263\225/README.MD" @@ -0,0 +1,4 @@ +# 代码详细说明请看文章 + +[Python 远程关机](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ) + diff --git a/README.md b/README.md index 8362e8a9..005335e5 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,61 @@ -### 这是我日常遇到的一些小问题的解决办法,全部是基于Python3 -1.[获取当前CPU状态,存储到Influxdb](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py) +# 欢迎关注我的微信公众号【智能制造社区】 -2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py) +## 左手代码,右手制造,分享智能制造相关技术和业务,包括 Python, C#, 数据库,工业大数据、物联网技术及MES/ERP/SAP等系统。 -3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py) +## 可以通过微信公众号加我好友 +![二维码](qrcode.jpg) -4.[爬取豆瓣电影top250](https://github.com/injetlee/demo/blob/master/douban_movie.py) +# 内容列表 -5.[Excel文件读入数据库](https://github.com/injetlee/demo/blob/master/excelToDatabase.py) +## [Python微信公众号开发](https://github.com/injetlee/Python/tree/master/wechat) -6.[爬取拉勾网职位信息](https://github.com/injetlee/demo/blob/master/lagouSpider.py) +- ### Python 微信公众号开发—小白篇(一) 
diff --git "a/Python \351\273\221\351\255\224\346\263\225/README.MD" "b/Python \351\273\221\351\255\224\346\263\225/README.MD"
new file mode 100644
index 00000000..248602a7
--- /dev/null
+++ "b/Python \351\273\221\351\255\224\346\263\225/README.MD"
@@ -0,0 +1,4 @@
+# See the articles for detailed code explanations
+
+[Python remote shutdown](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ)
+
diff --git a/README.md b/README.md
index 8362e8a9..005335e5 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,61 @@
-### Solutions to the small problems I run into day to day, all based on Python 3
-1.[Collect current CPU stats and store them in InfluxDB](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py)
-
+# Welcome to follow my WeChat official account 【智能制造社区】 (Smart Manufacturing Community)
+
-2.[Simulate logging in to Zhihu](https://github.com/injetlee/demo/blob/master/login_zhihu.py)
-
+## Code in one hand, manufacturing in the other: sharing smart-manufacturing technology and business topics, including Python, C#, databases, industrial big data, IoT, and systems such as MES/ERP/SAP.
+
-3.[Count all files under a directory](https://github.com/injetlee/demo/blob/master/countFile.py)
-
+## You can add me as a friend through the official account
+![QR code](qrcode.jpg)
+
-4.[Scrape Douban Movie Top 250](https://github.com/injetlee/demo/blob/master/douban_movie.py)
-
+# Contents
+
-5.[Load an Excel file into a database](https://github.com/injetlee/demo/blob/master/excelToDatabase.py)
-
+## [Python WeChat official account development](https://github.com/injetlee/Python/tree/master/wechat)
+
-6.[Scrape job postings from Lagou](https://github.com/injetlee/demo/blob/master/lagouSpider.py)
-
+- ### Python WeChat official account development - beginner's guide (1)
+
-7.[Batch-rename files](https://github.com/injetlee/demo/blob/master/ModifyFilename.py)
-
+- ### Python official account development - face-score detection
+
-8.[Read and write Excel](https://github.com/injetlee/demo/blob/master/readExcel.py)
-
+## [Python crawler primer collection](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88)
+
-9.[Download the Bing homepage image (today's only, one image)](https://github.com/injetlee/Python/blob/master/biyingSpider.py)
-
+- ### Python crawler primer (1) - scraping Qiushibaike
+
-10.[Python WeChat official account development](https://github.com/injetlee/Python/tree/master/wechat)
-
+- ### Python crawler primer (2) - scraping Meizitu
+
-11.[Python crawler primer (1) - scraping Qiushibaike](https://github.com/injetlee/Python/blob/master/qiubai_crawer.py)
-
+- ### Python crawler - a Python job-market analysis report
+
-12.[Python crawler tutorial collection](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88)
+- ### Python crawler power tool - an introduction to Selenium
+
+- ### Python crawler - capturing Douyin App videos
+
+## [Python black magic](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95)
+
+- ### Python remote shutdown
+
+## SQL databases
+
+- [A 1-hour SQL crash course (1)](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA)
+- [A 1-hour SQL crash course (2)](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA)
+- [A 1-hour SQL crash course (3)](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q)
+- [Advanced SQL queries (hierarchical queries, recursion)](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q)
+- [Advanced GROUP BY queries: ROLLUP, CUBE, and GROUPING explained](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ)
+- [SQL rows to columns, columns to rows](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ)
+
+## Miscellaneous
+
+- 1.[Collect current CPU stats and store them in InfluxDB](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py)
+
+- 2.[Simulate logging in to Zhihu](https://github.com/injetlee/demo/blob/master/login_zhihu.py)
+
+- 3.[Count all files under a directory](https://github.com/injetlee/demo/blob/master/countFile.py)
+
+- 4.[Scrape Douban Movie Top 250](https://github.com/injetlee/demo/blob/master/douban_movie.py)
+
+- 5.[Load an Excel file into a database](https://github.com/injetlee/demo/blob/master/excelToDatabase.py)
+
+- 6.[Scrape job postings from Lagou](https://github.com/injetlee/demo/blob/master/lagouSpider.py)
+
+- 7.[Batch-rename files](https://github.com/injetlee/demo/blob/master/ModifyFilename.py)
+
+- 8.[Read and write Excel](https://github.com/injetlee/demo/blob/master/readExcel.py)
+
+- 9.[Download the Bing homepage image (today's only, one image)](https://github.com/injetlee/Python/blob/master/biyingSpider.py)
diff --git a/qrcode.jpg b/qrcode.jpg
new file mode 100644
index 00000000..1c2532c3
Binary files /dev/null and b/qrcode.jpg differ
diff --git "a/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD" "b/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD"
index da24341a..4a64f572 100644
--- "a/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD"
"b/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD" @@ -1,6 +1,11 @@ # 代码详细说明请看文章 -![Python 爬虫入门(一)——爬取糗事百科](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw) +[Python 爬虫入门(一)——爬取糗事百科](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw) -![Python 爬虫入门(二)——爬取妹子图](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew) +[Python 爬虫入门(二)——爬取妹子图](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew) +[Python 爬虫——Python 岗位分析报告](https://mp.weixin.qq.com/s/8wAHBPnQMbcrP9La7WZiJA) + +[Python 爬虫利器——Selenium介绍](https://mp.weixin.qq.com/s/YJGjZkUejEos_yJ1ukp5kw) + +[Python 爬虫——抖音App视频抓包](https://mp.weixin.qq.com/s/a8Tky_u1u0A4vbssnAK2_g) \ No newline at end of file diff --git "a/\347\210\254\350\231\253\351\233\206\345\220\210/lagou.py" "b/\347\210\254\350\231\253\351\233\206\345\220\210/lagou.py" new file mode 100644 index 00000000..e0982308 --- /dev/null +++ "b/\347\210\254\350\231\253\351\233\206\345\220\210/lagou.py" @@ -0,0 +1,83 @@ +import random +import time + +import requests +from openpyxl import Workbook +import pymysql.cursors + + +def get_conn(): + '''建立数据库连接''' + conn = pymysql.connect(host='localhost', + user='root', + password='root', + db='python', + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + return conn + + +def insert(conn, info): + '''数据写入数据库''' + with conn.cursor() as cursor: + sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)" + cursor.execute(sql, info) + conn.commit() + + +def get_json(url, page, lang_name): + '''返回当前页面的信息列表''' + headers = { + 'Host': 'www.lagou.com', + 'Connection': 'keep-alive', + 'Content-Length': '23', + 'Origin': 'https://www.lagou.com', + 'X-Anit-Forge-Code': '0', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'X-Requested-With': 'XMLHttpRequest', + 'X-Anit-Forge-Token': 'None', + 'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7' + } + data = {'first': 'false', 'pn': page, 'kd': lang_name} + json = requests.post(url, data, headers=headers).json() + list_con = json['content']['positionResult']['result'] + info_list = [] + for i in list_con: + info = [] + info.append(i.get('companyShortName', '无')) + info.append(i.get('companyFullName', '无')) + info.append(i.get('industryField', '无')) + info.append(i.get('companySize', '无')) + info.append(i.get('salary', '无')) + info.append(i.get('city', '无')) + info.append(i.get('education', '无')) + info_list.append(info) + return info_list + + +def main(): + lang_name = 'python' + wb = Workbook() # 打开 excel 工作簿 + conn = get_conn() # 建立数据库连接 不存数据库 注释此行 + for i in ['北京', '上海', '广州', '深圳', '杭州']: # 五个城市 + page = 1 + ws1 = wb.active + ws1.title = lang_name + url = 
+
+
+def get_json(url, page, lang_name):
+    '''Return the list of job postings on one result page.'''
+    headers = {  # copied from a captured browser request; requests recomputes Content-Length itself
+        'Host': 'www.lagou.com',
+        'Connection': 'keep-alive',
+        'Content-Length': '23',
+        'Origin': 'https://www.lagou.com',
+        'X-Anit-Forge-Code': '0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'X-Requested-With': 'XMLHttpRequest',
+        'X-Anit-Forge-Token': 'None',
+        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
+    }
+    data = {'first': 'false', 'pn': page, 'kd': lang_name}
+    json = requests.post(url, data, headers=headers).json()
+    list_con = json['content']['positionResult']['result']
+    info_list = []
+    for i in list_con:
+        info = []
+        info.append(i.get('companyShortName', '无'))  # '无' means "not provided"
+        info.append(i.get('companyFullName', '无'))
+        info.append(i.get('industryField', '无'))
+        info.append(i.get('companySize', '无'))
+        info.append(i.get('salary', '无'))
+        info.append(i.get('city', '无'))
+        info.append(i.get('education', '无'))
+        info_list.append(info)
+    return info_list
+
+
+def main():
+    lang_name = 'python'
+    wb = Workbook()  # create the Excel workbook
+    conn = get_conn()  # open the database connection; comment this out to skip MySQL storage
+    for i in ['北京', '上海', '广州', '深圳', '杭州']:  # five cities (the API expects Chinese city names)
+        page = 1
+        ws1 = wb.active
+        ws1.title = lang_name
+        url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
+        while page < 31:  # 30 pages of postings per city
+            info = get_json(url, page, lang_name)
+            print(i, 'page', page)
+            page += 1
+            time.sleep(random.randint(10, 20))
+            for row in info:
+                insert(conn, tuple(row))  # insert into the database; comment this out to skip MySQL storage
+                ws1.append(row)
+    conn.close()  # close the database connection; comment this out to skip MySQL storage
+    wb.save('{}职位信息.xlsx'.format(lang_name))
+
+
+if __name__ == '__main__':
+    main()
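+
+
+# Illustrative readback sketch (an assumption, not in the original script): preview
+# the saved workbook with openpyxl to sanity-check the scrape.
+#
+#     from openpyxl import load_workbook
+#
+#     wb = load_workbook('python职位信息.xlsx')
+#     for row in wb.active.iter_rows(values_only=True):
+#         print(row)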