')
+# Extracts the text between the tags of the rough question section (i.e. drops the HTML tags).
+accurateQuestionRegex = re.compile(r'(?s)>(.*?)<')
+# Matches any single whitespace character so it can be removed from the result.
+deleteSpaceRegex = re.compile(r'\s')
+# Second clean-up pattern. NOTE(review): the literal below is split across two lines in this copy; it presumably held an HTML line-break tag that was lost in extraction - confirm against the original file.
+deleteBrRegex = re.compile(r'
')
+
+# 处理抓取任务
+def loop():
+ global i,thirdCount,titleRegex,NLRegex
+ i += 1
+ # 表示新线程启动了
+ thirdCount += 1
+
+ pageUrl = "http://club.xywy.com/static/1/" + str(i) + ".htm"
+ try:
+ request = urllib.urlopen(pageUrl)
+ except Exception, e:
+ # 减少一个线程
+ thirdCount -= 1
+ return
+
+ try:
+ # 获得网页源码
+ html = request.read()
+ except Exception, e:
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+ return
+
+ # 获取title
+ title = titleRegex.search(html)
+ # 获取自然语言
+ NL = NLRegex.findall(html)
+ # 获取大概的问题,里面会有html标签
+ generalQuestion = generalQuestionRegex.search(html)
+
+ # 没有找到title就退出
+ if title == None:
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+ return
+ # 如果是404页面就退出
+ if title.group(1).decode("gbk") == u"404页面":
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+ return
+ print "url: " + pageUrl + " title:" + title.group(1).decode("gbk")
+
+ # 获取大概的问题中的文字,去除html标签
+ accurateQuestion = accurateQuestionRegex.findall(generalQuestion.group(1))
+
+ # 如果有人说的话
+ if NL:
+ # 打开文件
+ NLFile = open('/home/geekgao/data/' + repr(time.time()),'w')
+ # 写入文件的结果字符串(问题和回答)
+ result = ''
+ for x in accurateQuestion:
+ result += x
+ for x in NL:
+ result += x
+ # 删除空白字符
+ result = deleteSpaceRegex.sub('',result)
+ # 删除
+ result = deleteBrRegex.sub('',result)
+
+ NLFile.write(result.decode("gbk").encode("utf-8"))
+ # 关闭文件
+ NLFile.close()
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+
+
+
+startTime = time.time()  # wall-clock start, used for the elapsed-time report at the end
+while i < 100000:
+ num = i
+ # Start a new worker whenever fewer than 50 are counted as running.
+ if thirdCount < 50:
+ print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount)
+ t = threading.Thread(target = loop, name = str(num) + "loopThird")
+ t.start()
+ time.sleep(0.001)
+
+thisStartTime = time.time()
+while thirdCount != 0:
+ # Give up after 10 seconds: without this, some threads occasionally never finish (the resource monitor shows them in wait channel "poll_scheme_time").
+ if time.time() - thisStartTime > 10:
+ print "等待时间到,强行退出."
+ break
+ print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作"
+ time.sleep(0.010)
+endTime = time.time()
+
+allTime = endTime - startTime
+print "完成!花费时间:" + str(allTime) + "s"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md"
new file mode 100644
index 0000000..5ba3215
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md"
@@ -0,0 +1,4 @@
+抓取药品时,数据有几十万,创建了一个线程池,始终容纳固定量的线程
+若某一个线程超时未完成任务,则自己退出,下一个线程进来
+
+这些都是一个模式,要仿照,请参照最成熟版本yaopintong2.py
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py"
new file mode 100644
index 0000000..f0425cd
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py"
@@ -0,0 +1,46 @@
+#coding=utf-8
+import urllib
+import re
+import threading
+
+i = 0  # shared page counter, incremented by every worker
+def loop():  # worker: fetch one disease page and write the disease name to the output file f
+ global i
+ i += 1
+ pageUrl = "http://jib.xywy.com/il_sii_" + str(i + 1) + ".htm"
+ request = urllib.urlopen(pageUrl)
+
+ # Read the page source.
+ html = request.read()
+ # Extract the page title. NOTE(review): this pattern is split across two lines in this copy, presumably because the tags it matched were lost in extraction - restore it from the original file.
+ disease = re.search(r"
(.*?)",html)
+ # Only continue when the pattern matched.
+ if disease:
+ # Print the disease name (GBK-decoded match text before the first ",") and the URL.
+ print disease.group(1).decode("gbk").split(",")[0] + " url:" + pageUrl
+ # Stop when the name starts with "404". NOTE(review): this return skips the request.close() below.
+ if re.match("^404",disease.group(1).decode("gbk").split(",")[0]):
+ return
+ # Write "<name> @f NeDisease" to the output file as UTF-8.
+ f.write((disease.group(1).decode("gbk").split(",")[0] + " @f NeDisease\n").encode("utf-8"))
+ # Close the response.
+ request.close()
+
+f = open("/home/geekgao/disease1",'w')  # output file shared by all workers
+
+while i < 10136:
+ # Thread objects of the current batch.
+ thirdList = []
+ # Number of threads started in this batch.
+ count = 0
+ # Start 200 threads per batch (the earlier comment said 100, but the loop bound is 200).
+ while count < 200:
+ count += 1
+ t = threading.Thread(target = loop, name = str(i))
+ t.start()
+ thirdList.append(t)
+ for t in thirdList:
+ t.join()
+
+f.close()
+print "完成"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py"
new file mode 100644
index 0000000..7a49298
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py"
@@ -0,0 +1,88 @@
+#coding=utf-8
+# 最大453482
+import urllib
+import re
+import threading
+import time
+import socket
+
+# Default timeout, in seconds, for sockets created from here on.
+socket.setdefaulttimeout(10)
+
+# Page number counter used to build the URLs.
+i = 30000
+# Running workers: dict of {sequence number: thread object}.
+thirdDict = {}
+
+# Worker: fetch one drug page and write the drug name and manufacturer to the two output files.
+def loop():
+ global i,thirdDict
+ i += 1
+ key = i
+ # Register the current thread under this worker's key.
+ thirdDict[key] = threading.current_thread()
+
+ pageUrl = "http://yao.xywy.com/goods/" + str(i + 1) + ".htm"  # NOTE(review): re-reads the shared counter i instead of using key
+ try:
+ request = urllib.urlopen(pageUrl)
+ except Exception, e:
+ # Unregister this worker.
+ thirdDict.pop(key)
+ return
+
+ try:
+ # Read the page source.
+ html = request.read()
+ except Exception, e:
+ # Close the response.
+ request.close()
+ # Unregister this worker.
+ thirdDict.pop(key)
+ return
+
+ # Extract the page title and the manufacturer. NOTE(review): the first pattern is split across two lines in this copy and the second may be truncated as well, presumably because HTML tags were lost in extraction - restore both from the original file.
+ medicament = re.search(r"
(.*)?",html)
+ org = re.search(r'生产企业.*?">(.*?)',html)
+ # Only continue when both the title and the manufacturer matched.
+ if medicament and org:
+ # An empty title is treated as a 404 page: report it and stop.
+ if medicament.group(1) == "":
+ print "404! url:" + pageUrl
+ # Close the response.
+ request.close()
+ # Unregister this worker.
+ thirdDict.pop(key)
+ return
+ # Print the drug name (UTF-8-decoded title text before the first "(") and the URL.
+ print medicament.group(1).decode("utf-8").split("(")[0] + " url:" + pageUrl
+ # Write "<name> @f NeMedicament" and "<manufacturer> @f NeOrg" to their files as UTF-8.
+ medicamentF.write((medicament.group(1).decode("utf-8").split("(")[0] + " @f NeMedicament\n").encode("utf-8"))
+ orgF.write((org.group(1).decode("utf-8") + " @f NeOrg\n").encode("utf-8"))
+ # Close the response.
+ request.close()
+ # Unregister this worker.
+ thirdDict.pop(key)
+
+medicamentF = open("/home/geekgao/medicament",'w')  # output file: drug names
+orgF = open("/home/geekgao/org",'w')  # output file: manufacturer names
+
+thisStartTime = time.time()
+while i < 453482:
+ num = i
+ # Start a new worker whenever fewer than 50 are registered.
+ if len(thirdDict) < 50:
+ # Quit on timeout (original note: without it some threads never finish; the resource monitor shows wait channel "poll_scheme_time"). NOTE(review): thisStartTime is taken before this loop, so this check ends the launch loop 10 s after start-up, while the wait loop below has no timeout - confirm this is intended.
+ if time.time() - thisStartTime > 10:
+ print "等待时间到,强行退出."
+ break
+ print '新进程:' + str(num) + "loopThird" + "进程总数:" + str(len(thirdDict))
+ t = threading.Thread(target = loop, name = str(num) + "loopThird")
+ # t = threading.Thread(target = thirdMonitor, name = str(num) + "thirdMonitor",args=(num,))
+ t.start()
+ time.sleep(0.001)
+
+while len(thirdDict) != 0:
+ time.sleep(0.001)
+medicamentF.close()
+orgF.close()
+print "完成"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py"
new file mode 100644
index 0000000..9d0f715
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py"
@@ -0,0 +1,47 @@
+#coding=utf-8
+import urllib
+import re
+import threading
+
+i = 0  # shared page counter, incremented by every worker
+def loop():  # worker: fetch one symptom page and write the symptom name to the output file f
+ global i
+ i += 1
+ pageUrl = "http://zzk.xywy.com/" + str(i) + "_gaishu.html"
+ request = urllib.urlopen(pageUrl)
+
+ # Read the page source.
+ html = request.read()
+ # Treat a body equal to "404" as a missing page and stop. NOTE(review): this return skips the request.close() below.
+ if html == "404":
+ print "404! url:" + pageUrl
+ return
+ # Extract the page title. NOTE(review): this pattern is split across two lines in this copy, presumably because the tags it matched were lost in extraction - restore it from the original file.
+ symptom = re.search(r"
(.*?)",html)
+ # Only continue when the pattern matched.
+ if symptom:
+ # Print the symptom name (GBK-decoded match text before u"怎么办") and the URL.
+ print symptom.group(1).decode("gbk").split(u"怎么办")[0] + " url:" + pageUrl
+ # Write "<name> @f Nesymptom" to the output file as UTF-8.
+ f.write((symptom.group(1).decode("gbk").split(u"怎么办")[0] + " @f Nesymptom\n").encode("utf-8"))
+ # Close the response.
+ request.close()
+
+f = open("/home/geekgao/symptom1",'w')  # output file shared by all workers
+
+while i < 6911:
+ # Thread objects of the current batch.
+ thirdList = []
+ # Number of threads started in this batch.
+ count = 0
+ # Start 200 threads per batch, then wait for all of them before the next batch.
+ while count < 200:
+ count += 1
+ t = threading.Thread(target = loop, name = str(i))
+ t.start()
+ thirdList.append(t)
+ for t in thirdList:
+ t.join()
+
+f.close()
+print "完成"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py"
new file mode 100644
index 0000000..2d7db96
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py"
@@ -0,0 +1,73 @@
+# coding=utf-8
+# 抓取药品通的网站需要的数据,这个代码是检查哪些网页不是404.存储起来,在yaopintong2.py中进行抓取
+import urllib
+import re
+import threading
+import time
+import socket
+
+# Default timeout, in seconds, for sockets created from here on.
+socket.setdefaulttimeout(8)
+
+# Page number the scan starts from.
+i = 800000
+# Number of workers currently counted as running.
+thirdCount = 0
+
+# 处理抓取任务
+def loop():
+ global i,thirdCount,titleRegex,NLRegex
+ i += 1
+ # 当前网页的编号
+ pageNum = i
+ # 表示新线程启动了
+ thirdCount += 1
+
+ pageUrl = "http://wapypk.39.net/manual/" + str(pageNum)
+ try:
+ request = urllib.urlopen(pageUrl)
+ except Exception, e:
+ # 减少一个线程
+ thirdCount -= 1
+ return
+
+ # 不正常就退出
+ if request.getcode() != 200:
+ print "不正常的页面:" + str(pageNum) + " 返回值:" + str(request.getcode())
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+ return
+ print "正常的页面:" + str(pageNum)
+
+ f.write(pageUrl + '\n')
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+
+startTime = time.time()  # wall-clock start, used for the elapsed-time report at the end
+f = open('/home/geekgao/1','a+')  # workers write the URL of every page that answered 200 to this file
+while i < 830000:
+ num = i + 1
+ # Start a new worker whenever fewer than 50 are counted as running.
+ if thirdCount < 50:
+ print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount)
+ t = threading.Thread(target = loop, name = str(num) + "loopThird")
+ t.start()
+ time.sleep(0.001)
+
+thisStartTime = time.time()
+while thirdCount != 0:
+ # Give up after 10 seconds: without this, some threads occasionally never finish (the resource monitor shows them in wait channel "poll_scheme_time").
+ if time.time() - thisStartTime > 10:
+ print "等待时间到,强行退出."
+ break
+ print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作"
+ time.sleep(0.010)
+endTime = time.time()
+
+allTime = endTime - startTime
+f.close()
+print "完成!花费时间:" + str(allTime) + "s"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py"
new file mode 100644
index 0000000..10190ac
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py"
@@ -0,0 +1,110 @@
+# coding=utf-8
+# 抓取药品通网站的数据,这里的链接是经yaopintong.py过滤后确实可用的链接
+import urllib
+import re
+import threading
+import time
+import socket
+
+# Default timeout, in seconds, for sockets created from here on.
+socket.setdefaulttimeout(8)
+
+# Counts started workers; the value is also used as the output file number.
+i = 0
+# Number of workers currently counted as running.
+thirdCount = 0
+# Matches the drug's generic name.
+medicamentNameRegex = re.compile(u'(?s)通用名称:(.*?)<')
+# Matches the indications. NOTE(review): this and the three patterns below are each split across two lines in this copy, presumably because an HTML tag inside the literal was lost in extraction - restore them from the original file.
+symptomRegex = re.compile(u'(?s)适应症:.*?
(.*?)<')
+# Matches the company name.
+companyNameRegex = re.compile(u'(?s)企业名称:.*?
(.*?)<')
+# Matches the company (production) address.
+companyAddressRegex = re .compile(u'(?s)生产地址:.*?
(.*?)<')
+# Matches the contact phone number.
+phoneNumRegex = re.compile(u'(?s)联系电话:.*?
(.*?)<')
+
+# 处理抓取任务
+def loop(pageUrl):
+ global i,thirdCount,medicamentNameRegex,symptomRegex,companyAddressRegex,companyNameRegex
+ i += 1
+ # 文件名用数字
+ fNum = i;
+ # 表示新线程启动了
+ thirdCount += 1
+
+ try:
+ request = urllib.urlopen(pageUrl)
+ except Exception, e:
+ # 减少一个线程
+ thirdCount -= 1
+ return
+
+ try:
+ # 获得网页源码
+ html = request.read().decode('gbk')
+ except Exception, e:
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+ return
+
+ # 正则匹配需要的数据
+ medicamentName = medicamentNameRegex.search(html)
+ symptom = symptomRegex.search(html)
+ companyName = companyNameRegex.search(html)
+ companyAddress = companyAddressRegex.search(html)
+ phoneNum = phoneNumRegex.search(html)
+
+ if medicamentName or symptom or companyName or companyAddress or phoneNum:
+ f = open('/home/geekgao/data/' + str(fNum),'w')
+ if medicamentName:
+ f.write(medicamentName.group(1).encode('utf-8') + '\n')
+ if symptom:
+ f.write(symptom.group(1).encode('utf-8') + '\n')
+ if companyName:
+ f.write(companyName.group(1).encode('utf-8') + '\n')
+ if companyAddress:
+ f.write(companyAddress.group(1).encode('utf-8') + '\n')
+ if phoneNum:
+ f.write(phoneNum.group(1).encode('utf-8') + '\n')
+ f.close()
+ print pageUrl + '抓取成功!'
+ else:
+ print pageUrl + '抓取失败!'
+
+ # 关闭请求
+ request.close()
+ # 减少一个线程
+ thirdCount -= 1
+
+startTime = time.time()
+# 打开存储有需要抓取的网页链接的文件
+f = open('/home/geekgao/1','r')
+while True:
+ num = i + 1
+ # 线程要始终保持在50个
+ if thirdCount <= 50:
+ pageUrl = f.readline()
+ # 读完了就退出循环
+ if pageUrl == '':
+ break
+ print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount)
+ t = threading.Thread(target = loop, name = str(num) + " loopThird",args=(pageUrl,))
+ t.start()
+ time.sleep(0.001)
+
+thisStartTime = time.time()
+while thirdCount != 0:
+ # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”)
+ if time.time() - thisStartTime > 10:
+ print "等待时间到,强行退出."
+ break
+ print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作"
+ time.sleep(0.010)
+endTime = time.time()
+
+allTime = endTime - startTime
+f.close()
+print "完成!花费时间:" + str(allTime) + "s"
\ No newline at end of file