From 040025845c60ccffa4620bbd3d3391f4548273a1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 13 Jul 2016 21:57:48 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=B1=BB=20=E6=9F=A5?= =?UTF-8?q?=E5=AD=97=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/Dict.py | 28 ++++ tools/Dict.window.py | 28 ++++ tools/SetBGPhoto.py | 21 +++ tools/downMp4OfJiKeXueYuan.py | 133 ++++++++++++++++++ .../NL.py" | 125 ++++++++++++++++ .../Readme.md" | 4 + .../disease.py" | 46 ++++++ .../medicamentANDorgThird.py" | 88 ++++++++++++ .../symptom.py" | 47 +++++++ .../yaopintong.py" | 73 ++++++++++ .../yaopintong2.py" | 110 +++++++++++++++ 11 files changed, 703 insertions(+) create mode 100644 tools/Dict.py create mode 100644 tools/Dict.window.py create mode 100644 tools/SetBGPhoto.py create mode 100644 tools/downMp4OfJiKeXueYuan.py create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py" create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md" create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py" create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py" create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py" create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py" create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py" diff --git a/tools/Dict.py b/tools/Dict.py new file mode 100644 index 0000000..8191007 --- /dev/null +++ b/tools/Dict.py @@ -0,0 +1,28 @@ +#!/usr/bin/python +#coding:utf-8 +import urllib +import sys +import re + +if len(sys.argv) == 1: #没有单词就提示用法 + print "用法:./Dict.py 要查找的单词" + sys.exit() + +word = "" +for x in range(len(sys.argv) - 1): #查找的可能是短语,中间有空格,如"join in",这里拼接单词 + word += " " + sys.argv[x + 1] +print "单词:" + word + +searchUrl = "http://dict.youdao.com/search?q=" + word + "&keyfrom=dict.index" #查找的地址 +response = urllib.urlopen(searchUrl).read() #获得查找到的网页源码 + +#从网页源码提取出单词释义那一部分 +searchSuccess = re.search(r"(?s)
\s*
",response) + +if searchSuccess: + means = re.findall(r"(?m)
  • (.*?)
  • ",searchSuccess.group()) #获取我们想提取的核心单词释义 + print "释义:" + for mean in means: + print "\t" + mean #输出释义 +else: + print "未查找到释义." \ No newline at end of file diff --git a/tools/Dict.window.py b/tools/Dict.window.py new file mode 100644 index 0000000..5badd57 --- /dev/null +++ b/tools/Dict.window.py @@ -0,0 +1,28 @@ +#!/usr/bin/python +#coding:utf-8 +import urllib +import sys +import re + +if len(sys.argv) == 1: #û�е��ʾ���ʾ�÷� + print "�÷�:./Dict.py Ҫ���ҵĵ���" + sys.exit() + +word = "" +for x in range(len(sys.argv) - 1): #���ҵĿ����Ƕ���м��пո���"join in",����ƴ�ӵ��� + word += " " + sys.argv[x + 1] +print "���ʣ�" + word + +searchUrl = "http://dict.youdao.com/search?q=" + word + "&keyfrom=dict.index" #���ҵĵ�ַ +response = urllib.urlopen(searchUrl).read() #��ò��ҵ�����ҳԴ�� + +#����ҳԴ����ȡ������������һ���� +searchSuccess = re.search(r"(?s)
    \s*
    ",response) + +if searchSuccess: + means = re.findall(r"(?m)
  • (.*?)
  • ",searchSuccess.group()) #��ȡ��������ȡ�ĺ��ĵ������� + print "���壺" + for mean in means: + print "\t" + mean.decode('utf-8').encode('gbk') #������� +else: + print "δ���ҵ�����." \ No newline at end of file diff --git a/tools/SetBGPhoto.py b/tools/SetBGPhoto.py new file mode 100644 index 0000000..c823e3a --- /dev/null +++ b/tools/SetBGPhoto.py @@ -0,0 +1,21 @@ +#!/usr/bin/python +#encoding:utf-8 +import time +import os +import urllib +import re + +html = urllib.urlopen("http://cn.bing.com/").read() + +imgAddress = re.search(r"http://.*?\.jpg",html) + +if imgAddress: + fileName = "/home/geekgao/图片/BingImg/" + time.strftime("%Y-%m-%d") + ".jpg" + print "今天Bing图片的地址是:" + imgAddress.group() + print "正在下载……" + urllib.urlretrieve(imgAddress.group(), fileName) + print "下载完毕!" + "存储为" + fileName + orderStr = "gsettings set org.gnome.desktop.background picture-uri \"file:" + fileName + "\"" + os.system(orderStr) +else: + print "今天貌似出问题了……" \ No newline at end of file diff --git a/tools/downMp4OfJiKeXueYuan.py b/tools/downMp4OfJiKeXueYuan.py new file mode 100644 index 0000000..2a23e59 --- /dev/null +++ b/tools/downMp4OfJiKeXueYuan.py @@ -0,0 +1,133 @@ +# !/usr/bin/python +# coding:utf-8 + +import urllib, os, urllib2, cookielib, re + +# 下载极客学院的视频 +# 需要一个vip账号(验证邮箱和手机会有体验vip) +class DownCourse(object): + # 给urllib2添加cookie支持 + # path: 下载的视频要保存的文件夹 + def __init__(self,path): + # 初始化一个CookieJar来处理Cookie + cookieJar = cookielib.CookieJar() + # 实例化一个全局opener + opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) + # 把这个cookie处理机制装上去,大概是这个意思-.- + urllib2.install_opener(opener) + + self.folderPath = path + # 判断文件夹是否存在 + folderExists = os.path.exists(self.folderPath) + if not folderExists: + os.mkdir(self.folderPath) + + # 登陆函数 + def login(self): + # 从登录页面获取登陆参数 + login_url = 'http://passport.jikexueyuan.com/sso/login' + # 登陆信息发送到这个地址 + passport_url = 'http://passport.jikexueyuan.com/submit/login?is_ajax=1' + verifyCode_url = 'http://passport.jikexueyuan.com/sso/verify' + + # 获取登陆页面源码 + request = urllib2.urlopen(login_url) + html = request.read() + request.close() + + # 获取登陆要post的数据 + expire = re.search(r"(?s)value='(.*?)' name='expire",html) + # 验证码 + verifyCodeGifPath = '/tmp/jikexueyuan.gif' + request = urllib2.urlopen(verifyCode_url) + gif = request.read() + request.close() + fGif = open(verifyCodeGifPath,'w') + fGif.write(gif) + fGif.close() + # 读取保存到本地的验证码图片 + os.system('eog ' + verifyCodeGifPath) + verify = raw_input("请输入图中的验证码:") + + data = { + 'expire': expire.group(1), + 'referer': 'https%3A%2F%2Fround-lake.dustinice.workers.dev%3A443%2Fhttp%2Fwww.jikexueyuan.com%2F', + 'uname': 用户名, + 'password': 密码, + 'verify': verify, + } + post_data = urllib.urlencode(data) + + request = urllib2.Request(passport_url,post_data) + # 给一个useragent,防止被认为是爬虫程序 + request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36') + # 发送登录请求 + request = urllib2.urlopen(request) + request.close() + print '登陆完成' + + # courseUrl: 课程地址首页,例如:http://www.jikexueyuan.com/course/989.html + def download(self, courseUrl): + # 获取课程名称 + request = urllib2.urlopen(courseUrl) + coursePageHtml = request.read() + request.close() + courseName = re.search(r'(?s)(.*?)-',coursePageHtml).group(1) + # 课程数量 + courseCount = int(re.search(r'(?s)class="timebox"><span>(.*?)课时',coursePageHtml).group(1)) + # 存储视频的文件夹路径 + folderPath = self.folderPath + courseName + '/' + # 判断文件夹是否存在 + folderExists = os.path.exists(folderPath) + if not folderExists: + os.mkdir(folderPath) + + print '课程名:' + courseName + ' 课程数量:' + str(courseCount) + # 课程的编号,构建课程的页面地址 + i = 0 + while i < courseCount: + i += 1 + pageUrl = courseUrl.split('.html')[0] + '_' + str(i) + '.html?ss=1' + # 本节课程的html代码 + request = urllib2.urlopen(pageUrl) + pageHtml = request.read() + request.close() + # 本节课程的名称 + name = re.search(r'(?s)<title>(.*?)-',pageHtml).group(1) + # 本节课程的视频地址 + videoUrl = re.search(r'<source src="(.*?)"',pageHtml) + # 有的页面写的课时比实际课时多,会匹配不到视频地址 + if videoUrl == None: + continue + else: + videoUrl = videoUrl.group(1) + print '正在下载' + name + '...' + # 存储视频的Path: 总路径/课程名/每一节的名称 + urllib.urlretrieve(videoUrl,folderPath + str(i) + name + '.mp4',self.cbk) + print '下载完成' + + # 从网上下载的可以显示下载进度的函数 + # \b是我加的,产生了很奇特的显示效果,还行 + def cbk(self,a, b, c): + '''回调函数 + @a: 已经下载的数据块 + @b: 数据块的大小 + @c: 远程文件的大小 + ''' + per = 100.0 * a * b / c + if per > 100: + per = 100 + print '%.2f%%\b\b\b\b\b\b' % per, + +# 建立下载对象,参数是即将下载的这些视频放的目录,程序会根据课程名在这个文件夹里面再建文件夹 +down = DownCourse('/home/geekgao/视频/SpringMVC/') +down.login() + +# 下载一个页面中的所有课程 +request = urllib2.urlopen('http://www.jikexueyuan.com/course/springmvc/') +html = request.read() +request.close() +courseUrls = re.findall(r'class="lesson-info-h2"><a href="(.*?)"',html) + +for courseUrl in courseUrls: + down.download(courseUrl) \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py" new file mode 100644 index 0000000..cf58082 --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py" @@ -0,0 +1,125 @@ +#coding=utf-8 +import urllib +import re +import threading +import time +import socket + +# 设置这么长时间超时 +socket.setdefaulttimeout(10) + +# 抓网页的地址数字 +i = 0 +# 存储线程的个数 +thirdCount = 0 +# 获取title的正则表达式 +titleRegex = re.compile(r"(?s)<title>(.*?)_") +# 获取自然语言的正则表达式(中间会有<br>,在最后写入文件之前去掉) +NLRegex = re.compile(r'(?s)<div class="pt15 f14 graydeep\s*pl20 pr20">(.*?)</div>') +# 获取大概的问题,里面会有html标签 +generalQuestionRegex = re.compile(r'(?s)<div class="graydeep User_quecol pt10 mt10" id="qdetailc"(.*?)/div>') +# 获取大概的问题中的文字,去除html标签 +accurateQuestionRegex = re.compile(r'(?s)>(.*?)<') +# 删除字符串中的空白字符 +deleteSpaceRegex = re.compile(r'\s') +# 删除<br> +deleteBrRegex = re.compile(r'<br>') + +# 处理抓取任务 +def loop(): + global i,thirdCount,titleRegex,NLRegex + i += 1 + # 表示新线程启动了 + thirdCount += 1 + + pageUrl = "http://club.xywy.com/static/1/" + str(i) + ".htm" + try: + request = urllib.urlopen(pageUrl) + except Exception, e: + # 减少一个线程 + thirdCount -= 1 + return + + try: + # 获得网页源码 + html = request.read() + except Exception, e: + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + return + + # 获取title + title = titleRegex.search(html) + # 获取自然语言 + NL = NLRegex.findall(html) + # 获取大概的问题,里面会有html标签 + generalQuestion = generalQuestionRegex.search(html) + + # 没有找到title就退出 + if title == None: + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + return + # 如果是404页面就退出 + if title.group(1).decode("gbk") == u"404页面": + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + return + print "url: " + pageUrl + " title:" + title.group(1).decode("gbk") + + # 获取大概的问题中的文字,去除html标签 + accurateQuestion = accurateQuestionRegex.findall(generalQuestion.group(1)) + + # 如果有人说的话 + if NL: + # 打开文件 + NLFile = open('/home/geekgao/data/' + repr(time.time()),'w') + # 写入文件的结果字符串(问题和回答) + result = '' + for x in accurateQuestion: + result += x + for x in NL: + result += x + # 删除空白字符 + result = deleteSpaceRegex.sub('',result) + # 删除<br> + result = deleteBrRegex.sub('',result) + + NLFile.write(result.decode("gbk").encode("utf-8")) + # 关闭文件 + NLFile.close() + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + + + +startTime = time.time() +while i < 100000: + num = i + # 线程要始终保持在50个 + if thirdCount < 50: + print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) + t = threading.Thread(target = loop, name = str(num) + "loopThird") + t.start() + time.sleep(0.001) + +thisStartTime = time.time() +while thirdCount != 0: + # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) + if time.time() - thisStartTime > 10: + print "等待时间到,强行退出." + break + print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" + time.sleep(0.010) +endTime = time.time() + +allTime = endTime - startTime +print "完成!花费时间:" + str(allTime) + "s" \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md" new file mode 100644 index 0000000..5ba3215 --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md" @@ -0,0 +1,4 @@ +抓取药品时,数据有几十万,创建了一个线程池,始终容纳固定量的线程 +若某一个线程超时未完成任务,则自己退出,下一个线程进来 + +这些都是一个模式,要仿照,请参照最成熟版本yaopintong2.py \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py" new file mode 100644 index 0000000..f0425cd --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py" @@ -0,0 +1,46 @@ +#coding=utf-8 +import urllib +import re +import threading + +i = 0 +def loop(): + global i + i += 1 + pageUrl = "http://jib.xywy.com/il_sii_" + str(i + 1) + ".htm" + request = urllib.urlopen(pageUrl) + + # 获得网页源码 + html = request.read() + # 获得title + disease = re.search(r"<title>(.*?)",html) + # 如果匹配到了title + if disease: + # 打印病名和链接 + print disease.group(1).decode("gbk").split(",")[0] + " url:" + pageUrl + # 如果是404就退出 + if re.match("^404",disease.group(1).decode("gbk").split(",")[0]): + return + # 写入文件 + f.write((disease.group(1).decode("gbk").split(",")[0] + " @f NeDisease\n").encode("utf-8")) + # 关闭请求 + request.close() + +f = open("/home/geekgao/disease1",'w') + +while i < 10136: + # 存储线程引用 + thirdList = [] + # = 线程计数 + count = 0 + # 每次同时启用100个线程 + while count < 200: + count += 1 + t = threading.Thread(target = loop, name = str(i)) + t.start() + thirdList.append(t) + for t in thirdList: + t.join() + +f.close() +print "完成" \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py" new file mode 100644 index 0000000..7a49298 --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py" @@ -0,0 +1,88 @@ +#coding=utf-8 +# 最大453482 +import urllib +import re +import threading +import time +import socket + +# 设置这么长时间超时 +socket.setdefaulttimeout(10) + +# 抓网页的地址数字 +i = 30000 +# 存储线程的dict[序号:线程引用] +thirdDict = {} + +# 处理抓取任务 +def loop(): + global i,thirdDict + i += 1 + key = i + # 放入当前进程的引用 + thirdDict[key] = threading.current_thread() + + pageUrl = "http://yao.xywy.com/goods/" + str(i + 1) + ".htm" + try: + request = urllib.urlopen(pageUrl) + except Exception, e: + # 删除key-value + thirdDict.pop(key) + return + + try: + # 获得网页源码 + html = request.read() + except Exception, e: + # 关闭请求 + request.close() + # 删除key-value + thirdDict.pop(key) + return + + # 获得title + medicament = re.search(r"(.*)?",html) + org = re.search(r'生产企业.*?">(.*?)',html) + # 如果匹配到了title和企业信息 + if medicament and org: + # 如果是404就退出 + if medicament.group(1) == "": + print "404! url:" + pageUrl + # 关闭请求 + request.close() + # 删除key-value + thirdDict.pop(key) + return + # 打印药名和链接 + print medicament.group(1).decode("utf-8").split("(")[0] + " url:" + pageUrl + # 写入文件 + medicamentF.write((medicament.group(1).decode("utf-8").split("(")[0] + " @f NeMedicament\n").encode("utf-8")) + orgF.write((org.group(1).decode("utf-8") + " @f NeOrg\n").encode("utf-8")) + # 关闭请求 + request.close() + # 删除key-value + thirdDict.pop(key) + +medicamentF = open("/home/geekgao/medicament",'w') +orgF = open("/home/geekgao/org",'w') + +thisStartTime = time.time() +while i < 453482: + num = i + # 线程要始终保持在50个 + if len(thirdDict) < 50: + # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) + if time.time() - thisStartTime > 10: + print "等待时间到,强行退出." + break + print '新进程:' + str(num) + "loopThird" + "进程总数:" + str(len(thirdDict)) + t = threading.Thread(target = loop, name = str(num) + "loopThird") + # t = threading.Thread(target = thirdMonitor, name = str(num) + "thirdMonitor",args=(num,)) + t.start() + time.sleep(0.001) + +while len(thirdDict) != 0: + time.sleep(0.001) +medicamentF.close() +orgF.close() +print "完成" \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py" new file mode 100644 index 0000000..9d0f715 --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py" @@ -0,0 +1,47 @@ +#coding=utf-8 +import urllib +import re +import threading + +i = 0 +def loop(): + global i + i += 1 + pageUrl = "http://zzk.xywy.com/" + str(i) + "_gaishu.html" + request = urllib.urlopen(pageUrl) + + # 获得网页源码 + html = request.read() + # 如果是404就退出 + if html == "404": + print "404! url:" + pageUrl + return + # 获得title + symptom = re.search(r"(.*?)",html) + # 如果匹配到了title + if symptom: + # 打印症状和链接 + print symptom.group(1).decode("gbk").split(u"怎么办")[0] + " url:" + pageUrl + # 写入文件 + f.write((symptom.group(1).decode("gbk").split(u"怎么办")[0] + " @f Nesymptom\n").encode("utf-8")) + # 关闭请求 + request.close() + +f = open("/home/geekgao/symptom1",'w') + +while i < 6911: + # 存储线程引用 + thirdList = [] + # = 线程计数 + count = 0 + # 每次同时启用200个线程 + while count < 200: + count += 1 + t = threading.Thread(target = loop, name = str(i)) + t.start() + thirdList.append(t) + for t in thirdList: + t.join() + +f.close() +print "完成" \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py" new file mode 100644 index 0000000..2d7db96 --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py" @@ -0,0 +1,73 @@ +# coding=utf-8 +# 抓取药品通的网站需要的数据,这个代码是检查哪些网页不是404.存储起来,在yaopintong2.py中进行抓取 +import urllib +import re +import threading +import time +import socket + +# 设置这么长时间超时 +socket.setdefaulttimeout(8) + +# 抓网页的地址起始数字 +i = 800000 +# 存储线程的个数 +thirdCount = 0 + +# 处理抓取任务 +def loop(): + global i,thirdCount,titleRegex,NLRegex + i += 1 + # 当前网页的编号 + pageNum = i + # 表示新线程启动了 + thirdCount += 1 + + pageUrl = "http://wapypk.39.net/manual/" + str(pageNum) + try: + request = urllib.urlopen(pageUrl) + except Exception, e: + # 减少一个线程 + thirdCount -= 1 + return + + # 不正常就退出 + if request.getcode() != 200: + print "不正常的页面:" + str(pageNum) + " 返回值:" + str(request.getcode()) + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + return + print "正常的页面:" + str(pageNum) + + f.write(pageUrl + '\n') + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + +startTime = time.time() +f = open('/home/geekgao/1','a+') +while i < 830000: + num = i + 1 + # 线程要始终保持在50个 + if thirdCount < 50: + print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) + t = threading.Thread(target = loop, name = str(num) + "loopThird") + t.start() + time.sleep(0.001) + +thisStartTime = time.time() +while thirdCount != 0: + # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) + if time.time() - thisStartTime > 10: + print "等待时间到,强行退出." + break + print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" + time.sleep(0.010) +endTime = time.time() + +allTime = endTime - startTime +f.close() +print "完成!花费时间:" + str(allTime) + "s" \ No newline at end of file diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py" new file mode 100644 index 0000000..10190ac --- /dev/null +++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py" @@ -0,0 +1,110 @@ +# coding=utf-8 +# 抓取药品通网站的数据,这里的链接是经yaopintong.py过滤后确实可用的链接 +import urllib +import re +import threading +import time +import socket + +# 设置这么长时间超时 +socket.setdefaulttimeout(8) + +# 进程计数,存储文件计数 +i = 0 +# 存储线程的个数 +thirdCount = 0 +# 匹配药品名称 +medicamentNameRegex = re.compile(u'(?s)通用名称:(.*?)<') +# 匹配适应症状 +symptomRegex = re.compile(u'(?s)适应症:.*?

    (.*?)<') +# 匹配公司名称 +companyNameRegex = re.compile(u'(?s)企业名称:.*?

    (.*?)<') +# 匹配公司地址 +companyAddressRegex = re .compile(u'(?s)生产地址:.*?

    (.*?)<') +# 电话 +phoneNumRegex = re.compile(u'(?s)联系电话:.*?

    (.*?)<') + +# 处理抓取任务 +def loop(pageUrl): + global i,thirdCount,medicamentNameRegex,symptomRegex,companyAddressRegex,companyNameRegex + i += 1 + # 文件名用数字 + fNum = i; + # 表示新线程启动了 + thirdCount += 1 + + try: + request = urllib.urlopen(pageUrl) + except Exception, e: + # 减少一个线程 + thirdCount -= 1 + return + + try: + # 获得网页源码 + html = request.read().decode('gbk') + except Exception, e: + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + return + + # 正则匹配需要的数据 + medicamentName = medicamentNameRegex.search(html) + symptom = symptomRegex.search(html) + companyName = companyNameRegex.search(html) + companyAddress = companyAddressRegex.search(html) + phoneNum = phoneNumRegex.search(html) + + if medicamentName or symptom or companyName or companyAddress or phoneNum: + f = open('/home/geekgao/data/' + str(fNum),'w') + if medicamentName: + f.write(medicamentName.group(1).encode('utf-8') + '\n') + if symptom: + f.write(symptom.group(1).encode('utf-8') + '\n') + if companyName: + f.write(companyName.group(1).encode('utf-8') + '\n') + if companyAddress: + f.write(companyAddress.group(1).encode('utf-8') + '\n') + if phoneNum: + f.write(phoneNum.group(1).encode('utf-8') + '\n') + f.close() + print pageUrl + '抓取成功!' + else: + print pageUrl + '抓取失败!' + + # 关闭请求 + request.close() + # 减少一个线程 + thirdCount -= 1 + +startTime = time.time() +# 打开存储有需要抓取的网页链接的文件 +f = open('/home/geekgao/1','r') +while True: + num = i + 1 + # 线程要始终保持在50个 + if thirdCount <= 50: + pageUrl = f.readline() + # 读完了就退出循环 + if pageUrl == '': + break + print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) + t = threading.Thread(target = loop, name = str(num) + " loopThird",args=(pageUrl,)) + t.start() + time.sleep(0.001) + +thisStartTime = time.time() +while thirdCount != 0: + # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) + if time.time() - thisStartTime > 10: + print "等待时间到,强行退出." + break + print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" + time.sleep(0.010) +endTime = time.time() + +allTime = endTime - startTime +f.close() +print "完成!花费时间:" + str(allTime) + "s" \ No newline at end of file