From 60458b26b068e9fb6cff94cec90197476c5053dd Mon Sep 17 00:00:00 2001
From: razzl
Date: Mon, 6 Apr 2015 14:16:52 +0800
Subject: [PATCH 1/5] Create 0008.py

---
Note: adds a Python 2 script (urllib2, print statement). It downloads the
page at `url`, removes <script> and <style> elements, splits the remainder
on CR/LF runs, and for every non-blank line prints the tag-stripped text
when that text is at least half as long as the raw line (true division is
enabled via the __future__ import, so the ratio is a float).

 razzl/0008.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 razzl/0008.py

diff --git a/razzl/0008.py b/razzl/0008.py
new file mode 100644
index 00000000..542efcca
--- /dev/null
+++ b/razzl/0008.py
@@ -0,0 +1,19 @@
+from __future__ import division#division
+import re
+import urllib2
+
+url = 'http://world.cankaoxiaoxi.com/2015/0404/730644.shtml'
+html = urllib2.urlopen(url).read()
+html = re.sub(r'<script[^>]*>([\s\S])*?</script[^>]*>','',html)#delete the script
+html = re.sub(r'<style[^>]*>([\s\S])*?</style[^>]*>','',html)#delete the style
+html = re.split("[\r\n]+",html)#split
+for line in html:
+    if line.strip()=='':
+        continue
+    line_sub = re.sub(r'<[^>]*>','',line)#record the words in a line
+    if len(line_sub)/len(line) >= 0.5:#compare the text of the density
+        if(line_sub.strip()!=''):
+            print line_sub.strip()
+
+
+

From 4224771eb34fcdfe9a2a42e7a3f0341816a487cd Mon Sep 17 00:00:00 2001
From: razzl
Date: Mon, 6 Apr 2015 14:17:04 +0800
Subject: [PATCH 2/5] Delete 0008.py

---
Note: removes razzl/0008.py again. PATCH 3/5 re-adds the same content
under razzl/0008/0008.py, so the removed lines below must stay identical
to the lines added in PATCH 1/5.

 razzl/0008.py | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 razzl/0008.py

diff --git a/razzl/0008.py b/razzl/0008.py
deleted file mode 100644
index 542efcca..00000000
--- a/razzl/0008.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from __future__ import division#division
-import re
-import urllib2
-
-url = 'http://world.cankaoxiaoxi.com/2015/0404/730644.shtml'
-html = urllib2.urlopen(url).read()
-html = re.sub(r'<script[^>]*>([\s\S])*?</script[^>]*>','',html)#delete the script
-html = re.sub(r'<style[^>]*>([\s\S])*?</style[^>]*>','',html)#delete the style
-html = re.split("[\r\n]+",html)#split
-for line in html:
-    if line.strip()=='':
-        continue
-    line_sub = re.sub(r'<[^>]*>','',line)#record the words in a line
-    if len(line_sub)/len(line) >= 0.5:#compare the text of the density
-        if(line_sub.strip()!=''):
-            print line_sub.strip()
-
-
-

From 200eaf7f73a93a0f625201b93e782cee5b29dba4 Mon Sep 17 00:00:00 2001
From: razzl
Date: Mon, 6 Apr 2015 14:17:24 +0800
Subject: [PATCH 3/5] Create 0008.py

---
Note: re-creates the script from PATCH 1/5, unchanged, at its new path
razzl/0008/0008.py (same blob id 542efcca in the index line).

 razzl/0008/0008.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 razzl/0008/0008.py

diff --git a/razzl/0008/0008.py b/razzl/0008/0008.py
new file mode 100644
index 00000000..542efcca
--- /dev/null
+++ b/razzl/0008/0008.py
@@ -0,0 +1,19 @@
+from __future__ import division#division
+import re
+import urllib2
+
+url = 'http://world.cankaoxiaoxi.com/2015/0404/730644.shtml'
+html = urllib2.urlopen(url).read()
+html = re.sub(r'<script[^>]*>([\s\S])*?</script[^>]*>','',html)#delete the script
+html = re.sub(r'<style[^>]*>([\s\S])*?</style[^>]*>','',html)#delete the style
+html = re.split("[\r\n]+",html)#split
+for line in html:
+    if line.strip()=='':
+        continue
+    line_sub = re.sub(r'<[^>]*>','',line)#record the words in a line
+    if len(line_sub)/len(line) >= 0.5:#compare the text of the density
+        if(line_sub.strip()!=''):
+            print line_sub.strip()
+
+
+

From c78c964e2f610bf3fafa76d20c613d7e7678865e Mon Sep 17 00:00:00 2001
From: razzl
Date: Mon, 6 Apr 2015 14:17:48 +0800
Subject: [PATCH 4/5] Create 0009.py

---
Note: adds a Python 2 script that downloads the page at `url` and prints
whatever the src="..." / href="..." capture groups matched. The pattern
has two groups in an alternation, so re.findall yields 2-tuples in which
the group that did not take part is the empty string; the inner loop
prints both elements, blank ones included.

 razzl/0009/0009.py | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 razzl/0009/0009.py

diff --git a/razzl/0009/0009.py b/razzl/0009/0009.py
new file mode 100644
index 00000000..3108f6b3
--- /dev/null
+++ b/razzl/0009/0009.py
@@ -0,0 +1,9 @@
+import re
+import urllib2
+
+url = 'http://www.cnblogs.com/jasondan/p/3497757.html'
+html = urllib2.urlopen(url).read()
+links = re.findall(r'<[^>]+src="([^>]+)"[^>]*>|<[^>]+href="([^>]+)"[^>]*>',html)#find the link
+for link in links:
+    for lin in link:
+        print lin

From c2419c68fdf6d7f032eba9425b3d37452361002a Mon Sep 17 00:00:00 2001
From: razzl
Date: Sun, 12 Apr 2015 13:41:27 +0800
Subject: [PATCH 5/5] Create 0013.py

---
Note: adds a Python 2 script that downloads the page at `url`, collects
the src value of every `class="BDE_Image" src="..."` occurrence, and
saves each one with urllib.urlretrieve as 0.jpg, 1.jpg, ... under the
hard-coded directory C:/Users/zzl/Desktop/1/. The script does not create
that directory itself.

 razzl/0013/0013.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 razzl/0013/0013.py

diff --git a/razzl/0013/0013.py b/razzl/0013/0013.py
new file mode 100644
index 00000000..efc58185
--- /dev/null
+++ b/razzl/0013/0013.py
@@ -0,0 +1,13 @@
+import urllib2
+import urllib
+import re
+
+url = 'http://tieba.baidu.com/p/2166231880'
+html = urllib2.urlopen(url).read()
+
+photos = re.findall(r'class="BDE_Image" src="([^"]+)"',html)
+i=0
+for photo in photos:
+    urllib.urlretrieve(photo,'C:/Users/zzl/Desktop/1/'+str(i)+'.jpg')
+    i += 1
+