diff --git "a/Python \351\273\221\351\255\224\346\263\225/Python \350\277\234\347\250\213\345\274\200\346\234\272.py" "b/Python \351\273\221\351\255\224\346\263\225/Python \350\277\234\347\250\213\345\274\200\346\234\272.py"
new file mode 100644
index 00000000..1b23c160
--- /dev/null
+++ "b/Python \351\273\221\351\255\224\346\263\225/Python \350\277\234\347\250\213\345\274\200\346\234\272.py"
@@ -0,0 +1,35 @@
+import socket
+import struct
+import time
+
+from django.http import HttpResponse
+
+
+def wake_up(request, mac='DC-4A-3E-78-3E-0A'):
+    MAC = mac
+    BROADCAST = "192.168.0.255"
+    if len(MAC) != 17:
+        raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'")
+    mac_address = MAC.replace("-", '')
+    data = ''.join(['FFFFFFFFFFFF', mac_address * 20])  # build the raw magic-packet string
+    send_data = b''
+
+    # Convert the raw hex string into a byte sequence
+    for i in range(0, len(data), 2):
+        send_data = b''.join([send_data, struct.pack('B', int(data[i: i + 2], 16))])
+    print(send_data)
+
+    # Broadcast it over a UDP socket; to guard against packet loss, send three times, one second apart
+    try:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
+        sock.sendto(send_data, (BROADCAST, 7))
+        time.sleep(1)
+        sock.sendto(send_data, (BROADCAST, 7))
+        time.sleep(1)
+        sock.sendto(send_data, (BROADCAST, 7))
+        print("Done")
+        return HttpResponse()
+    except Exception as e:
+        print(e)
+        return HttpResponse()
\ No newline at end of file
diff --git "a/Python \351\273\221\351\255\224\346\263\225/README.MD" "b/Python \351\273\221\351\255\224\346\263\225/README.MD"
new file mode 100644
index 00000000..248602a7
--- /dev/null
+++ "b/Python \351\273\221\351\255\224\346\263\225/README.MD"
@@ -0,0 +1,4 @@
+# See the article below for a detailed walkthrough of the code
+
+[Python remote shutdown](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ)
+
diff --git a/README.md b/README.md
index 0e2bc49e..005335e5 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,61 @@
-### Solutions to small problems I run into day to day, all based on Python 3
-1. [Collect the current CPU stats and store them in InfluxDB](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py)
+# Welcome to follow my WeChat official account 【智能制造社区】 (Smart Manufacturing Community)
-2. [Simulated login to Zhihu](https://github.com/injetlee/demo/blob/master/login_zhihu.py)
+## Code in one hand, manufacturing in the other: sharing smart-manufacturing technology and business topics, including Python, C#, databases, industrial big data, IoT, and systems such as MES/ERP/SAP.
-3. [Count all files under a directory](https://github.com/injetlee/demo/blob/master/countFile.py)
+## You can add me as a friend via the official account
+![QR code](qrcode.jpg)
-4. [Scrape the Douban Movie Top 250](https://github.com/injetlee/demo/blob/master/douban_movie.py)
+# Contents
-5. [Load an Excel file into a database](https://github.com/injetlee/demo/blob/master/excelToDatabase.py)
+## [Python WeChat Official Account development](https://github.com/injetlee/Python/tree/master/wechat)
-6. [Scrape job listings from Lagou](https://github.com/injetlee/demo/blob/master/lagouSpider.py)
+- ### Python WeChat Official Account development — beginner's guide (Part 1)
-7. [Batch-rename files](https://github.com/injetlee/demo/blob/master/ModifyFilename.py)
+- ### Python Official Account development — face-score detection
-8. [Read and write Excel files](https://github.com/injetlee/demo/blob/master/readExcel.py)
+## [Python web scraping primer collection](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88)
-9. [Download the Bing homepage image — only today's, a single image](https://github.com/injetlee/Python/blob/master/biyingSpider.py)
+- ### Python web scraping primer (1) — scraping Qiushibaike
-10. [Python WeChat Official Account development](https://github.com/injetlee/Python/tree/master/wechat)
+
+- ### Python web scraping primer (2) — scraping Meizitu
+
+- ### Python web scraping — a Python job-market analysis report
+
+- ### Python scraping power tool — an introduction to Selenium
+
+- ### Python web scraping — capturing and scraping Douyin app videos
+
+## [Python black magic](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95)
+
+- ### Python remote shutdown
+
+## SQL and databases
+
+- [1-hour SQL crash course (Part 1)](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA)
+- [1-hour SQL crash course (Part 2)](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA)
+- [1-hour SQL crash course (Part 3)](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q)
+- [Advanced SQL queries — hierarchical queries and recursion](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q)
+- [Advanced GROUP BY queries: ROLLUP, CUBE, and GROUPING explained](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ)
+- [SQL pivoting: rows to columns, columns to rows](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ)
+
+## Miscellaneous
+
+- 1. [Collect the current CPU stats and store them in InfluxDB](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py)
+
+- 2. [Simulated login to Zhihu](https://github.com/injetlee/demo/blob/master/login_zhihu.py)
+
+- 3. [Count all files under a directory](https://github.com/injetlee/demo/blob/master/countFile.py)
+
+- 4. [Scrape the Douban Movie Top 250](https://github.com/injetlee/demo/blob/master/douban_movie.py)
+
+- 5. [Load an Excel file into a database](https://github.com/injetlee/demo/blob/master/excelToDatabase.py)
+
+- 6. [Scrape job listings from Lagou](https://github.com/injetlee/demo/blob/master/lagouSpider.py)
+
+- 7. [Batch-rename files](https://github.com/injetlee/demo/blob/master/ModifyFilename.py)
+
+- 8. [Read and write Excel files](https://github.com/injetlee/demo/blob/master/readExcel.py)
+
+- 9. [Download the Bing homepage image — only today's, a single image](https://github.com/injetlee/Python/blob/master/biyingSpider.py)
diff --git a/qiubai_crawer.py b/qiubai_crawer.py
new file mode 100644
index 00000000..e37e7e7d
--- /dev/null
+++ b/qiubai_crawer.py
@@ -0,0 +1,54 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def download_page(url):
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
+    r = requests.get(url, headers=headers)
+    return r.text
+
+
+def get_content(html, page):
+    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
+    soup = BeautifulSoup(html, 'html.parser')
+    con = soup.find(id='content-left')
+    con_list = con.find_all('div', class_="article")
+    for i in con_list:
+        author = i.find('h2').string  # author name
+        content = i.find('div', class_='content').find('span').get_text()  # post text
+        stats = i.find('div', class_='stats')
+        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
+        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
+        author_info = i.find('div', class_='articleGender')  # author age and gender
+        if author_info is not None:  # registered (non-anonymous) user
+            class_list = author_info['class']
+            if "womenIcon" in class_list:
+                gender = '女'
+            elif "manIcon" in class_list:
+                gender = '男'
+            else:
+                gender = ''
+            age = author_info.string  # age
+        else:  # anonymous user
+            gender = ''
+            age = ''
+
+        save_txt(output.format(page, author, gender, age, vote, comment, content))
+
+
+def save_txt(*args):
+    for i in args:
+        with open('qiubai.txt', 'a', encoding='utf-8') as f:
+            f.write(i)
+
+
+def main():
+    # The site shows 13 pages at the bottom of the listing, so the URLs can be built directly;
+    # ideally we would use Beautiful Soup to read the page count from the page footer instead of hard-coding it.
+    for i in range(1, 14):
+        url = 'https://qiushibaike.com/text/page/{}'.format(i)
+        html = download_page(url)
+        get_content(html, i)
+
+if __name__ == '__main__':
+    main()
diff --git a/qrcode.jpg b/qrcode.jpg
new file mode 100644
index 00000000..1c2532c3
Binary files /dev/null and b/qrcode.jpg differ
diff --git "a/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD" "b/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD"
new file mode 100644
index 00000000..4a64f572
--- /dev/null
+++ "b/\347\210\254\350\231\253\351\233\206\345\220\210/README.MD"
@@ -0,0 +1,11 @@
+# See the articles below for a detailed walkthrough of the code
+
+[Python web scraping primer (1) — scraping Qiushibaike](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw)
+
+[Python web scraping primer (2) — scraping Meizitu](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew)
+
+[Python web scraping — a Python job-market analysis report](https://mp.weixin.qq.com/s/8wAHBPnQMbcrP9La7WZiJA)
+
+[Python scraping power tool — an introduction to Selenium](https://mp.weixin.qq.com/s/YJGjZkUejEos_yJ1ukp5kw)
+
+[Python web scraping — capturing Douyin app video traffic](https://mp.weixin.qq.com/s/a8Tky_u1u0A4vbssnAK2_g)
\ No newline at end of file
diff --git "a/\347\210\254\350\231\253\351\233\206\345\220\210/lagou.py" "b/\347\210\254\350\231\253\351\233\206\345\220\210/lagou.py"
new file mode 100644
index 00000000..e0982308
--- /dev/null
+++ "b/\347\210\254\350\231\253\351\233\206\345\220\210/lagou.py"
@@ -0,0 +1,83 @@
+import random
+import time
+
+import requests
+from openpyxl import Workbook
+import pymysql.cursors
+
+
+def get_conn():
+    '''Create a database connection'''
+    conn = pymysql.connect(host='localhost',
+                           user='root',
+                           password='root',
+                           db='python',
+                           charset='utf8mb4',
+                           cursorclass=pymysql.cursors.DictCursor)
+    return conn
+
+
+def insert(conn, info):
+    '''Write one record to the database'''
+    with conn.cursor() as cursor:
+        sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
+        cursor.execute(sql, info)
+    conn.commit()
+
+
+def get_json(url, page, lang_name):
+    '''Return the list of job records on the given page'''
+    headers = {
+        'Host': 'www.lagou.com',
+        'Connection': 'keep-alive',
+        'Content-Length': '23',
+        'Origin': 'https://www.lagou.com',
+        'X-Anit-Forge-Code': '0',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'X-Requested-With': 'XMLHttpRequest',
+        'X-Anit-Forge-Token': 'None',
+        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
+    }
+    data = {'first': 'false', 'pn': page, 'kd': lang_name}
+    json = requests.post(url, data, headers=headers).json()
+    list_con = json['content']['positionResult']['result']
+    info_list = []
+    for i in list_con:
+        info = []
+        info.append(i.get('companyShortName', '无'))
+        info.append(i.get('companyFullName', '无'))
+        info.append(i.get('industryField', '无'))
+        info.append(i.get('companySize', '无'))
+        info.append(i.get('salary', '无'))
+        info.append(i.get('city', '无'))
+        info.append(i.get('education', '无'))
+        info_list.append(info)
+    return info_list
+
+
+def main():
+    lang_name = 'python'
+    wb = Workbook()  # open an Excel workbook
+    conn = get_conn()  # open the database connection; comment this line out if you don't want to store to a database
+    for i in ['北京', '上海', '广州', '深圳', '杭州']:  # five cities
+        page = 1
+        ws1 = wb.active
+        ws1.title = lang_name
+        url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
+        while page < 31:  # 30 pages of listings per city
+            info = get_json(url, page, lang_name)
+            page += 1
+            print(i, 'page', page)
+            time.sleep(random.randint(10, 20))
+            for row in info:
+                insert(conn, tuple(row))  # insert into the database; comment this line out if you don't want to store it
+                ws1.append(row)
+    conn.close()  # close the database connection; comment this line out if not using the database
+    wb.save('{}职位信息.xlsx'.format(lang_name))
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git "a/\347\210\254\350\231\253\351\233\206\345\220\210/meizitu.py" "b/\347\210\254\350\231\253\351\233\206\345\220\210/meizitu.py"
new file mode 100644
index 00000000..4f6932db
--- /dev/null
+++ "b/\347\210\254\350\231\253\351\233\206\345\220\210/meizitu.py"
@@ -0,0 +1,77 @@
+import requests
+import os
+import time
+import threading
+from bs4 import BeautifulSoup
+
+
+def download_page(url):
+    '''
+    Download a page and return its HTML
+    '''
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
+    r = requests.get(url, headers=headers)
+    r.encoding = 'gb2312'
+    return r.text
+
+
+def get_pic_list(html):
+    '''
+    Get the list of photo sets on a list page, then call get_pic for each set
+    '''
+    soup = BeautifulSoup(html, 'html.parser')
+    pic_list = soup.find_all('li', class_='wp-item')
+    for i in pic_list:
+        a_tag = i.find('h3', class_='tit').find('a')
+        link = a_tag.get('href')
+        text = a_tag.get_text()
+        get_pic(link, text)
+
+
+def get_pic(link, text):
+    '''
+    Download and save every image on a photo-set page
+    '''
+    html = download_page(link)  # download the set page
+    soup = BeautifulSoup(html, 'html.parser')
+    pic_list = soup.find('div', id="picture").find_all('img')  # find all images on the page
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
+    create_dir('pic/{}'.format(text))
+    for i in pic_list:
+        pic_link = i.get('src')  # the actual image URL
+        r = requests.get(pic_link, headers=headers)  # download the image, then save it to a file
+        with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f:
+            f.write(r.content)
+        time.sleep(1)  # pause briefly so we don't put too much load on the site and get banned
+
+
+def create_dir(name):
+    if not os.path.exists(name):
+        os.makedirs(name)
+
+
+def execute(url):
+    page_html = download_page(url)
+    get_pic_list(page_html)
+
+
+def main():
+    create_dir('pic')
+    queue = [i for i in range(1, 72)]  # page numbers used to build the list-page URLs
+    threads = []
+    while len(queue) > 0:
+        for thread in threads:
+            if not thread.is_alive():
+                threads.remove(thread)
+        while len(threads) < 5 and len(queue) > 0:  # cap the number of worker threads at 5
+            cur_page = queue.pop(0)
+            url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
+            thread = threading.Thread(target=execute, args=(url,))
+            thread.daemon = True
+            thread.start()
+            print('{}正在下载{}页'.format(thread.name, cur_page))
+            threads.append(thread)
+
+
+if __name__ == '__main__':
+    main()
diff --git "a/\347\210\254\350\231\253\351\233\206\345\220\210/qiubai_crawer.py" "b/\347\210\254\350\231\253\351\233\206\345\220\210/qiubai_crawer.py"
new file mode 100644
index 00000000..e37e7e7d
--- /dev/null
+++ "b/\347\210\254\350\231\253\351\233\206\345\220\210/qiubai_crawer.py"
@@ -0,0 +1,54 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def download_page(url):
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
+    r = requests.get(url, headers=headers)
+    return r.text
+
+
+def get_content(html, page):
+    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
+    soup = BeautifulSoup(html, 'html.parser')
+    con = soup.find(id='content-left')
+    con_list = con.find_all('div', class_="article")
+    for i in con_list:
+        author = i.find('h2').string  # author name
+        content = i.find('div', class_='content').find('span').get_text()  # post text
+        stats = i.find('div', class_='stats')
+        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
+        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
+        author_info = i.find('div', class_='articleGender')  # author age and gender
+        if author_info is not None:  # registered (non-anonymous) user
+            class_list = author_info['class']
+            if "womenIcon" in class_list:
+                gender = '女'
+            elif "manIcon" in class_list:
+                gender = '男'
+            else:
+                gender = ''
+            age = author_info.string  # age
+        else:  # anonymous user
+            gender = ''
+            age = ''
+
+        save_txt(output.format(page, author, gender, age, vote, comment, content))
+
+
+def save_txt(*args):
+    for i in args:
+        with open('qiubai.txt', 'a', encoding='utf-8') as f:
+            f.write(i)
+
+
+def main():
+    # The site shows 13 pages at the bottom of the listing, so the URLs can be built directly;
+    # ideally we would use Beautiful Soup to read the page count from the page footer instead of hard-coding it.
+    for i in range(1, 14):
+        url = 'https://qiushibaike.com/text/page/{}'.format(i)
+        html = download_page(url)
+        get_content(html, i)
+
+if __name__ == '__main__':
+    main()
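
The comment in main() above notes that the crawler hard-codes 13 pages and that it would be better to read the page count from the pagination at the bottom of the page with Beautiful Soup. Below is a minimal sketch of that idea; the 'ul.pagination' / 'span.page-numbers' selectors are assumptions about the site's markup (they are not taken from the code above), so they would need to be checked against the live page and adjusted.

from bs4 import BeautifulSoup


def get_page_count(html, default=13):
    # Parse the listing page and return the highest page number found in the
    # pagination block; fall back to the hard-coded default if nothing matches.
    # NOTE: 'ul.pagination span.page-numbers' is a hypothetical selector.
    soup = BeautifulSoup(html, 'html.parser')
    numbers = []
    for span in soup.select('ul.pagination span.page-numbers'):
        text = (span.string or '').strip()
        if text.isdigit():
            numbers.append(int(text))
    return max(numbers) if numbers else default


# Usage sketch: fetch the first page, detect the count, then crawl every page.
# first_html = download_page('https://qiushibaike.com/text/page/1')
# for i in range(1, get_page_count(first_html) + 1):
#     get_content(download_page('https://qiushibaike.com/text/page/{}'.format(i)), i)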