From 040025845c60ccffa4620bbd3d3391f4548273a1 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Wed, 13 Jul 2016 21:57:48 +0800
Subject: [PATCH] =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=B1=BB=20=E6=9F=A5?=
 =?UTF-8?q?=E5=AD=97=E5=85=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/Dict.py                                 |  28 ++++
 tools/Dict.window.py                          |  28 ++++
 tools/SetBGPhoto.py                           |  21 +++
 tools/downMp4OfJiKeXueYuan.py                 | 133 ++++++++++++++++++
 .../NL.py"                                    | 125 ++++++++++++++++
 .../Readme.md"                                |   4 +
 .../disease.py"                               |  46 ++++++
 .../medicamentANDorgThird.py"                 |  88 ++++++++++++
 .../symptom.py"                               |  47 +++++++
 .../yaopintong.py"                            |  73 ++++++++++
 .../yaopintong2.py"                           | 110 +++++++++++++++
 11 files changed, 703 insertions(+)
 create mode 100644 tools/Dict.py
 create mode 100644 tools/Dict.window.py
 create mode 100644 tools/SetBGPhoto.py
 create mode 100644 tools/downMp4OfJiKeXueYuan.py
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py"
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md"
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py"
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py"
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py"
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py"
 create mode 100644 "tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py"
diff --git a/tools/Dict.py b/tools/Dict.py
new file mode 100644
index 0000000..8191007
--- /dev/null
+++ b/tools/Dict.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+#coding:utf-8
+import urllib
+import sys
+import re
+
+if len(sys.argv) == 1:	#没有单词就提示用法
+	print "用法:./Dict.py 要查找的单词"
+	sys.exit()
+
+word = ""
+for x in range(len(sys.argv) - 1): #查找的可能是短语，中间有空格，如"join in",这里拼接单词
+	word += " " + sys.argv[x + 1]
+print "单词：" + word
+
+searchUrl = "http://dict.youdao.com/search?q=" + word + "&keyfrom=dict.index"	#查找的地址
+response = urllib.urlopen(searchUrl).read() #获得查找到的网页源码
+
+#从网页源码提取出单词释义那一部分
+searchSuccess = re.search(r"(?s)<div class=\"trans-container\">\s*<ul>.*?</div>",response)
+
+if searchSuccess:
+	means = re.findall(r"(?m)<li>(.*?)</li>",searchSuccess.group()) #获取我们想提取的核心单词释义
+	print "释义："
+	for mean in means:
+		print "\t" + mean	#输出释义
+else:
+	print "未查找到释义."
\ No newline at end of file
diff --git a/tools/Dict.window.py b/tools/Dict.window.py
new file mode 100644
index 0000000..5badd57
--- /dev/null
+++ b/tools/Dict.window.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+#coding:utf-8
+import urllib
+import sys
+import re
+
+if len(sys.argv) == 1:	#û�е��ʾ���ʾ�÷�
+	print "�÷�:./Dict.py Ҫ���ҵĵ���"
+	sys.exit()
+
+word = ""
+for x in range(len(sys.argv) - 1): #���ҵĿ����Ƕ���м��пո���"join in",����ƴ�ӵ���
+	word += " " + sys.argv[x + 1]
+print "���ʣ�" + word
+
+searchUrl = "http://dict.youdao.com/search?q=" + word + "&keyfrom=dict.index"	#���ҵĵ�ַ
+response = urllib.urlopen(searchUrl).read() #��ò��ҵ�����ҳԴ��
+
+#����ҳԴ����ȡ������������һ����
+searchSuccess = re.search(r"(?s)<div class=\"trans-container\">\s*<ul>.*?</div>",response)
+
+if searchSuccess:
+	means = re.findall(r"(?m)<li>(.*?)</li>",searchSuccess.group()) #��ȡ��������ȡ�ĺ��ĵ�������
+	print "���壺"
+	for mean in means:
+		print "\t" + mean.decode('utf-8').encode('gbk')	#�������
+else:
+	print "δ���ҵ�����."
\ No newline at end of file
diff --git a/tools/SetBGPhoto.py b/tools/SetBGPhoto.py
new file mode 100644
index 0000000..c823e3a
--- /dev/null
+++ b/tools/SetBGPhoto.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+#encoding:utf-8
+import time
+import os
+import urllib
+import re
+
+html = urllib.urlopen("http://cn.bing.com/").read()	# source of the Bing home page
+# the first http://...jpg address found in the page is treated as today's picture
+imgAddress = re.search(r"http://.*?\.jpg",html)
+# download it under a date-stamped name, then set it as the GNOME desktop background
+if imgAddress:
+	fileName = "/home/geekgao/图片/BingImg/" + time.strftime("%Y-%m-%d") + ".jpg"
+	print "今天Bing图片的地址是:" + imgAddress.group()
+	print "正在下载……"
+	urllib.urlretrieve(imgAddress.group(), fileName)
+	print "下载完毕!" + "存储为" + fileName
+	orderStr = "gsettings set org.gnome.desktop.background picture-uri \"file:" + fileName + "\""
+	os.system(orderStr)	# run the gsettings command built above
+else:
+	print "今天貌似出问题了……"	# no .jpg address was found in the page
\ No newline at end of file
diff --git a/tools/downMp4OfJiKeXueYuan.py b/tools/downMp4OfJiKeXueYuan.py
new file mode 100644
index 0000000..2a23e59
--- /dev/null
+++ b/tools/downMp4OfJiKeXueYuan.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python
+# coding:utf-8
+
+import urllib, os, urllib2, cookielib, re
+
+# 下载极客学院的视频
+# 需要一个vip账号(验证邮箱和手机会有体验vip)
+class DownCourse(object):
+	# 给urllib2添加cookie支持
+	# path: 下载的视频要保存的文件夹
+	def __init__(self,path):
+		# 初始化一个CookieJar来处理Cookie
+		cookieJar = cookielib.CookieJar()
+		# 实例化一个全局opener
+		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
+		# 把这个cookie处理机制装上去,大概是这个意思-.-
+		urllib2.install_opener(opener)
+		
+		self.folderPath = path
+		# 判断文件夹是否存在
+		folderExists = os.path.exists(self.folderPath)
+		if not folderExists:
+			os.mkdir(self.folderPath)
+		
+	# 登陆函数
+	def login(self):
+		# 从登录页面获取登陆参数
+		login_url = 'http://passport.jikexueyuan.com/sso/login'
+		# 登陆信息发送到这个地址
+		passport_url = 'http://passport.jikexueyuan.com/submit/login?is_ajax=1'
+		verifyCode_url = 'http://passport.jikexueyuan.com/sso/verify'
+		
+		# 获取登陆页面源码
+		request = urllib2.urlopen(login_url)
+		html = request.read()
+		request.close()
+		
+		# 获取登陆要post的数据
+		expire = re.search(r"(?s)value='(.*?)' name='expire",html)
+		# 验证码
+		verifyCodeGifPath = '/tmp/jikexueyuan.gif'
+		request = urllib2.urlopen(verifyCode_url)
+		gif = request.read()
+		request.close()
+		fGif = open(verifyCodeGifPath,'w')
+		fGif.write(gif)
+		fGif.close()
+		# 读取保存到本地的验证码图片
+		os.system('eog ' + verifyCodeGifPath)
+		verify = raw_input("请输入图中的验证码:")
+		
+		data = {
+			'expire': expire.group(1),
+			'referer': 'https%3A%2F%2Fround-lake.dustinice.workers.dev%3A443%2Fhttp%2Fwww.jikexueyuan.com%2F',
+			'uname': 用户名,
+			'password': 密码,
+			'verify': verify,
+		}
+		post_data = urllib.urlencode(data)
+		
+		request = urllib2.Request(passport_url,post_data)
+		# 给一个useragent,防止被认为是爬虫程序
+		request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36')
+		# 发送登录请求
+		request = urllib2.urlopen(request)
+		request.close()
+		print '登陆完成'
+
+	# courseUrl: 课程地址首页,例如:http://www.jikexueyuan.com/course/989.html
+	def download(self, courseUrl):
+		# 获取课程名称
+		request = urllib2.urlopen(courseUrl)
+		coursePageHtml = request.read()
+		request.close()
+		courseName = re.search(r'(?s)<title>(.*?)-',coursePageHtml).group(1)
+		# 课程数量
+		courseCount = int(re.search(r'(?s)class="timebox"><span>(.*?)课时',coursePageHtml).group(1))
+		# 存储视频的文件夹路径
+		folderPath = self.folderPath + courseName + '/'
+		# 判断文件夹是否存在
+		folderExists = os.path.exists(folderPath)
+		if not folderExists:
+			os.mkdir(folderPath)
+		
+		print '课程名:' + courseName + ' 课程数量:' + str(courseCount)
+		# 课程的编号,构建课程的页面地址
+		i = 0
+		while i < courseCount:
+			i += 1
+			pageUrl = courseUrl.split('.html')[0] + '_' + str(i) + '.html?ss=1'
+			# 本节课程的html代码
+			request = urllib2.urlopen(pageUrl)
+			pageHtml = request.read()
+			request.close()
+			# 本节课程的名称
+			name = re.search(r'(?s)<title>(.*?)-',pageHtml).group(1)
+			# 本节课程的视频地址
+			videoUrl = re.search(r'<source src="(.*?)"',pageHtml)
+			# 有的页面写的课时比实际课时多,会匹配不到视频地址
+			if videoUrl == None:
+				continue
+			else:
+				videoUrl = videoUrl.group(1)
+			print '正在下载' + name + '...'
+			# 存储视频的Path: 总路径/课程名/每一节的名称
+			urllib.urlretrieve(videoUrl,folderPath + str(i) + name + '.mp4',self.cbk)
+		print '下载完成'
+	
+	# 从网上下载的可以显示下载进度的函数
+	# \b是我加的,产生了很奇特的显示效果,还行
+	def cbk(self,a, b, c): 
+	    '''回调函数
+	    @a: 已经下载的数据块
+	    @b: 数据块的大小
+	    @c: 远程文件的大小
+	    ''' 
+	    per = 100.0 * a * b / c 
+	    if per > 100: 
+	        per = 100 
+	    print '%.2f%%\b\b\b\b\b\b' % per,
+		
+# Create the downloader. The argument is the folder the videos go into; a sub-folder named after each course is created inside it.
+down = DownCourse('/home/geekgao/视频/SpringMVC/')
+down.login()
+
+# download every course linked from one listing page
+request = urllib2.urlopen('http://www.jikexueyuan.com/course/springmvc/')
+html = request.read()
+request.close()
+courseUrls = re.findall(r'class="lesson-info-h2"><a href="(.*?)"',html)
+
+for courseUrl in courseUrls:
+	down.download(courseUrl)
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py"
new file mode 100644
index 0000000..cf58082
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/NL.py"
@@ -0,0 +1,125 @@
+#coding=utf-8
+import urllib
+import re
+import threading
+import time
+import socket
+
+# default timeout, in seconds, for every socket operation
+socket.setdefaulttimeout(10)
+
+# number of the page being fetched; each worker increments it before use
+i = 0
+# number of worker threads currently running
+thirdCount = 0
+# captures the page title up to the first underscore
+titleRegex = re.compile(r"(?s)<title>(.*?)_")
+# captures the natural-language text (it may contain <br>, which is removed before writing to file)
+NLRegex = re.compile(r'(?s)<div class="pt15 f14 graydeep\s*pl20 pr20">(.*?)</div>')
+# captures the rough question block, which still contains html tags
+generalQuestionRegex = re.compile(r'(?s)<div class="graydeep User_quecol pt10 mt10" id="qdetailc"(.*?)/div>')
+# captures the text between tags, i.e. the question without the html tags
+accurateQuestionRegex = re.compile(r'(?s)>(.*?)<')
+# matches the whitespace characters to delete
+deleteSpaceRegex = re.compile(r'\s')
+# matches the <br> tags to delete
+deleteBrRegex = re.compile(r'<br>')
+
+# 处理抓取任务
+def loop():
+	global i,thirdCount,titleRegex,NLRegex
+	i += 1
+	# 表示新线程启动了
+	thirdCount += 1
+	
+	pageUrl = "http://club.xywy.com/static/1/" + str(i) + ".htm"
+	try:
+		request = urllib.urlopen(pageUrl)
+	except Exception, e:
+		# 减少一个线程
+		thirdCount -= 1
+		return
+	
+	try:
+		# 获得网页源码
+		html = request.read()
+	except Exception, e:
+		# 关闭请求
+		request.close()
+		# 减少一个线程
+		thirdCount -= 1
+		return
+		
+	# 获取title
+	title = titleRegex.search(html)
+	# 获取自然语言
+	NL = NLRegex.findall(html)
+	# 获取大概的问题，里面会有html标签
+	generalQuestion = generalQuestionRegex.search(html)
+	
+	# 没有找到title就退出
+	if title == None:
+		# 关闭请求
+		request.close()
+		# 减少一个线程
+		thirdCount -= 1
+		return
+	# 如果是404页面就退出
+	if title.group(1).decode("gbk") == u"404页面":
+		# 关闭请求
+		request.close()
+		# 减少一个线程
+		thirdCount -= 1
+		return
+	print "url: " + pageUrl + " title:" + title.group(1).decode("gbk")
+	
+	# 获取大概的问题中的文字，去除html标签
+	accurateQuestion = accurateQuestionRegex.findall(generalQuestion.group(1))
+
+	# 如果有人说的话
+	if NL:
+		# 打开文件
+		NLFile = open('/home/geekgao/data/' + repr(time.time()),'w')
+		# 写入文件的结果字符串（问题和回答）
+		result = ''
+		for x in accurateQuestion:
+			result += x
+		for x in NL:
+			result += x
+		# 删除空白字符
+		result = deleteSpaceRegex.sub('',result)
+		# 删除<br>
+		result = deleteBrRegex.sub('',result)
+		
+		NLFile.write(result.decode("gbk").encode("utf-8"))
+		# 关闭文件
+		NLFile.close()
+	# 关闭请求
+	request.close()
+	# 减少一个线程
+	thirdCount -= 1
+	
+
+
+startTime = time.time()
+while i < 100000:
+	num = i
+	# keep up to 50 worker threads running
+	if thirdCount < 50:
+		print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount)
+		t = threading.Thread(target = loop, name = str(num) + "loopThird")
+		t.start()
+	time.sleep(0.001)
+
+thisStartTime = time.time()
+while thirdCount != 0:
+	# stop waiting after 10 seconds (without this the threads sometimes never all exit; the system monitor shows them in wait channel "poll_scheme_time")
+	if time.time() - thisStartTime > 10:
+		print "等待时间到,强行退出."
+		break
+	print "等待线程全部结束！还有" + str(thirdCount) + "个线程在工作"
+	time.sleep(0.010)
+endTime = time.time()
+
+allTime = endTime - startTime
+print "完成!花费时间:" + str(allTime) + "s"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md"
new file mode 100644
index 0000000..5ba3215
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/Readme.md"
@@ -0,0 +1,4 @@
+抓取药品时，数据有几十万，创建了一个线程池，始终容纳固定量的线程
+若某一个线程超时未完成任务，则自己退出，下一个线程进来
+
+这些都是一个模式,要仿照,请参照最成熟版本yaopintong2.py
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py"
new file mode 100644
index 0000000..f0425cd
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/disease.py"
@@ -0,0 +1,46 @@
+#coding=utf-8
+import urllib
+import re
+import threading
+
+i = 0
+def loop():
+	global i
+	i += 1
+	pageUrl = "http://jib.xywy.com/il_sii_" + str(i + 1) + ".htm"
+	request = urllib.urlopen(pageUrl)
+
+	# 获得网页源码
+	html = request.read()
+	# 获得title
+	disease = re.search(r"<title>(.*?)</title>",html)
+	# 如果匹配到了title
+	if disease:
+		# 打印病名和链接
+		print disease.group(1).decode("gbk").split(",")[0] + " url:" + pageUrl
+		# 如果是404就退出
+		if re.match("^404",disease.group(1).decode("gbk").split(",")[0]):
+			return
+		# 写入文件
+		f.write((disease.group(1).decode("gbk").split(",")[0] + " @f NeDisease\n").encode("utf-8"))
+	# 关闭请求
+	request.close()
+	
+f = open("/home/geekgao/disease1",'w')
+
+while i < 10136:
+	# references to the threads of the current batch
+	thirdList = []
+	# number of threads started in this batch
+	count = 0
+	# start 200 threads per batch
+	while count < 200:
+		count += 1
+		t = threading.Thread(target = loop, name = str(i))
+		t.start()
+		thirdList.append(t)
+	for t in thirdList:
+		t.join()
+
+f.close()
+print "完成"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py"
new file mode 100644
index 0000000..7a49298
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/medicamentANDorgThird.py"
@@ -0,0 +1,88 @@
+#coding=utf-8
+# 最大453482
+import urllib
+import re
+import threading
+import time
+import socket
+
+# default timeout, in seconds, for every socket operation
+socket.setdefaulttimeout(10)
+
+# counter the page numbers are derived from; each worker increments it
+i = 30000
+# running workers: {sequence number: thread object}
+thirdDict = {}
+
+# 处理抓取任务
+def loop():
+	global i,thirdDict
+	i += 1
+	key = i
+	# 放入当前进程的引用
+	thirdDict[key] = threading.current_thread()
+	
+	pageUrl = "http://yao.xywy.com/goods/" + str(i + 1) + ".htm"
+	try:
+		request = urllib.urlopen(pageUrl)
+	except Exception, e:
+		# 删除key-value
+		thirdDict.pop(key)
+		return
+	
+	try:
+		# 获得网页源码
+		html = request.read()
+	except Exception, e:
+		# 关闭请求
+		request.close()
+		# 删除key-value
+		thirdDict.pop(key)
+		return
+		
+	# 获得title
+	medicament = re.search(r"<title>(.*)?</title>",html)
+	org = re.search(r'生产企业.*?">(.*?)</a>',html)
+	# 如果匹配到了title和企业信息
+	if medicament and org:
+		# 如果是404就退出
+		if medicament.group(1) == "":
+			print "404! url:" + pageUrl
+			# 关闭请求
+			request.close()
+			# 删除key-value
+			thirdDict.pop(key)
+			return
+		# 打印药名和链接
+		print medicament.group(1).decode("utf-8").split("(")[0] + " url:" + pageUrl
+		# 写入文件
+		medicamentF.write((medicament.group(1).decode("utf-8").split("(")[0] + " @f NeMedicament\n").encode("utf-8"))
+		orgF.write((org.group(1).decode("utf-8") + " @f NeOrg\n").encode("utf-8"))
+	# 关闭请求
+	request.close()
+	# 删除key-value
+	thirdDict.pop(key)
+	
+medicamentF = open("/home/geekgao/medicament",'w')
+orgF = open("/home/geekgao/org",'w')
+
+thisStartTime = time.time()
+while i < 453482:
+	num = i
+	# 线程要始终保持在50个
+	if len(thirdDict) < 50:
+		# 等待超时就退出（没有这个有时候线程并不能全部退出，看资源管理器，说“等候频道 poll_scheme_time”）
+		if time.time() - thisStartTime > 10:
+			print "等待时间到,强行退出."
+			break
+		print '新进程:' + str(num) + "loopThird" + "进程总数:" + str(len(thirdDict))
+		t = threading.Thread(target = loop, name = str(num) + "loopThird")
+		# t = threading.Thread(target = thirdMonitor, name = str(num) + "thirdMonitor",args=(num,))
+		t.start()
+	time.sleep(0.001)
+
+while len(thirdDict) != 0:
+	time.sleep(0.001)
+medicamentF.close()
+orgF.close()
+print "完成"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py"
new file mode 100644
index 0000000..9d0f715
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/symptom.py"
@@ -0,0 +1,47 @@
+#coding=utf-8
+import urllib
+import re
+import threading
+
+i = 0
+def loop():
+	global i
+	i += 1
+	pageUrl = "http://zzk.xywy.com/" + str(i) + "_gaishu.html"
+	request = urllib.urlopen(pageUrl)
+
+	# 获得网页源码
+	html = request.read()
+	# 如果是404就退出
+	if html == "404":
+		print "404! url:" + pageUrl
+		return
+	# 获得title
+	symptom = re.search(r"<title>(.*?)</title>",html)
+	# 如果匹配到了title
+	if symptom:
+		# 打印症状和链接
+		print symptom.group(1).decode("gbk").split(u"怎么办")[0] + " url:" + pageUrl
+		# 写入文件
+		f.write((symptom.group(1).decode("gbk").split(u"怎么办")[0] + " @f Nesymptom\n").encode("utf-8"))
+	# 关闭请求
+	request.close()
+	
+f = open("/home/geekgao/symptom1",'w')
+
+while i < 6911:
+	# references to the threads of the current batch
+	thirdList = []
+	# number of threads started in this batch
+	count = 0
+	# start 200 threads per batch
+	while count < 200:
+		count += 1
+		t = threading.Thread(target = loop, name = str(i))
+		t.start()
+		thirdList.append(t)
+	for t in thirdList:
+		t.join()
+
+f.close()
+print "完成"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py"
new file mode 100644
index 0000000..2d7db96
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong.py"
@@ -0,0 +1,73 @@
+# coding=utf-8
+# 抓取药品通的网站需要的数据，这个代码是检查哪些网页不是404.存储起来，在yaopintong2.py中进行抓取
+import urllib
+import re
+import threading
+import time
+import socket
+
+# default timeout, in seconds, for every socket operation
+socket.setdefaulttimeout(8)
+
+# start value of the page number; each worker increments it before use
+i = 800000
+# number of worker threads currently running
+thirdCount = 0
+
+# 处理抓取任务
+def loop():
+	global i,thirdCount,titleRegex,NLRegex
+	i += 1
+	# 当前网页的编号
+	pageNum = i
+	# 表示新线程启动了
+	thirdCount += 1
+	
+	pageUrl = "http://wapypk.39.net/manual/" + str(pageNum)
+	try:
+		request = urllib.urlopen(pageUrl)
+	except Exception, e:
+		# 减少一个线程
+		thirdCount -= 1
+		return
+	
+	# 不正常就退出
+	if request.getcode() != 200:
+		print "不正常的页面:" + str(pageNum) + " 返回值:" + str(request.getcode())
+		# 关闭请求
+		request.close()
+		# 减少一个线程
+		thirdCount -= 1
+		return
+	print "正常的页面:" + str(pageNum)
+	
+	f.write(pageUrl + '\n')
+	# 关闭请求
+	request.close()
+	# 减少一个线程
+	thirdCount -= 1
+	
+startTime = time.time()
+f = open('/home/geekgao/1','a+')
+while i < 830000:
+	num = i + 1
+	# keep up to 50 worker threads running
+	if thirdCount < 50:
+		print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount)
+		t = threading.Thread(target = loop, name = str(num) + "loopThird")
+		t.start()
+	time.sleep(0.001)
+
+thisStartTime = time.time()
+while thirdCount != 0:
+	# stop waiting after 10 seconds (without this the threads sometimes never all exit; the system monitor shows them in wait channel "poll_scheme_time")
+	if time.time() - thisStartTime > 10:
+		print "等待时间到,强行退出."
+		break
+	print "等待线程全部结束！还有" + str(thirdCount) + "个线程在工作"
+	time.sleep(0.010)
+endTime = time.time()
+
+allTime = endTime - startTime
+f.close()
+print "完成!花费时间:" + str(allTime) + "s"
\ No newline at end of file
diff --git "a/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py" "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py"
new file mode 100644
index 0000000..10190ac
--- /dev/null
+++ "b/tools/\346\212\223\350\215\257\345\223\201\346\225\260\346\215\256/yaopintong2.py"
@@ -0,0 +1,110 @@
+# coding=utf-8
+# 抓取药品通网站的数据,这里的链接是经yaopintong.py过滤后确实可用的链接
+import urllib
+import re
+import threading
+import time
+import socket
+
+# default timeout, in seconds, for every socket operation
+socket.setdefaulttimeout(8)
+
+# counts the started workers; also used to name the output files
+i = 0
+# number of worker threads currently running
+thirdCount = 0
+# matches the medicine's generic name
+medicamentNameRegex = re.compile(u'(?s)通用名称：(.*?)<')
+# matches the indications
+symptomRegex = re.compile(u'(?s)适应症：.*?<p>(.*?)<')
+# matches the company name
+companyNameRegex = re.compile(u'(?s)企业名称：.*?<p>(.*?)<')
+# matches the company's production address
+companyAddressRegex = re .compile(u'(?s)生产地址：.*?<p>(.*?)<')
+# matches the phone number
+phoneNumRegex = re.compile(u'(?s)联系电话：.*?<p>(.*?)<')
+
+# 处理抓取任务
+def loop(pageUrl):
+	global i,thirdCount,medicamentNameRegex,symptomRegex,companyAddressRegex,companyNameRegex
+	i += 1
+	# 文件名用数字
+	fNum = i;
+	# 表示新线程启动了
+	thirdCount += 1
+	
+	try:
+		request = urllib.urlopen(pageUrl)
+	except Exception, e:
+		# 减少一个线程
+		thirdCount -= 1
+		return
+	
+	try:
+		# 获得网页源码
+		html = request.read().decode('gbk')
+	except Exception, e:
+		# 关闭请求
+		request.close()
+		# 减少一个线程
+		thirdCount -= 1
+		return
+
+	# 正则匹配需要的数据
+	medicamentName = medicamentNameRegex.search(html)
+	symptom = symptomRegex.search(html)
+	companyName = companyNameRegex.search(html)
+	companyAddress = companyAddressRegex.search(html)
+	phoneNum = phoneNumRegex.search(html)
+	
+	if medicamentName or symptom or companyName or companyAddress or phoneNum:
+		f = open('/home/geekgao/data/' + str(fNum),'w')
+		if medicamentName:
+			f.write(medicamentName.group(1).encode('utf-8') + '\n')
+		if symptom:
+			f.write(symptom.group(1).encode('utf-8') + '\n')
+		if companyName:
+			f.write(companyName.group(1).encode('utf-8') + '\n')
+		if companyAddress:
+			f.write(companyAddress.group(1).encode('utf-8') + '\n')
+		if phoneNum:
+			f.write(phoneNum.group(1).encode('utf-8') + '\n')
+		f.close()
+		print pageUrl + '抓取成功!'
+	else:
+		print pageUrl + '抓取失败!'
+	
+	# 关闭请求
+	request.close()
+	# 减少一个线程
+	thirdCount -= 1
+	
+startTime = time.time()
+# open the file that holds the addresses of the pages to crawl
+f = open('/home/geekgao/1','r')
+while True:
+	num = i + 1
+	# keep the worker threads at about 50 - NOTE(review): "<=" lets a 51st thread start, the sibling scripts use "< 50"
+	if thirdCount <= 50:
+		pageUrl = f.readline()
+		# an empty string means the whole file has been read: leave the loop
+		if pageUrl == '':
+			break
+		print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount)
+		t = threading.Thread(target = loop, name = str(num) + " loopThird",args=(pageUrl,))
+		t.start()
+	time.sleep(0.001)
+
+thisStartTime = time.time()
+while thirdCount != 0:
+	# stop waiting after 10 seconds (without this the threads sometimes never all exit; the system monitor shows them in wait channel "poll_scheme_time")
+	if time.time() - thisStartTime > 10:
+		print "等待时间到,强行退出."
+		break
+	print "等待线程全部结束！还有" + str(thirdCount) + "个线程在工作"
+	time.sleep(0.010)
+endTime = time.time()
+
+allTime = endTime - startTime
+f.close()
+print "完成!花费时间:" + str(allTime) + "s"
\ No newline at end of file