Skip to content

Commit 2b708ac

Browse files
committed
adding scraping II
1 parent 1ab0b88 commit 2b708ac

File tree

1 file changed

+182
-0
lines changed

1 file changed

+182
-0
lines changed

Notebooks/Untitled.ipynb

Lines changed: 182 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,182 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"#import pylab\n",
10+
"import re\n",
11+
"import pandas as pd\n",
12+
"import numpy as np\n",
13+
"import matplotlib.pyplot as plt\n",
14+
"import seaborn as sns\n",
15+
"\n",
16+
"import requests\n",
17+
"import urllib\n",
18+
"from bs4 import BeautifulSoup as bs\n",
19+
"\n",
20+
"%matplotlib inline\n",
21+
"#%matplotlib notebook"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 2,
27+
"metadata": {},
28+
"outputs": [
29+
{
30+
"name": "stdout",
31+
"output_type": "stream",
32+
"text": [
33+
"200\n"
34+
]
35+
}
36+
],
37+
"source": [
38+
"url2 = 'https://ge.globo.com/futebol/brasileirao-serie-a/' \n",
39+
"content = requests.get(url2)\n",
40+
"print(content.status_code)"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 4,
46+
"metadata": {},
47+
"outputs": [
48+
{
49+
"name": "stdout",
50+
"output_type": "stream",
51+
"text": [
52+
"<class 'bs4.BeautifulSoup'>\n"
53+
]
54+
}
55+
],
56+
"source": [
57+
"soup = bs(content.text,'html5lib')\n",
58+
"print(type(soup))"
59+
]
60+
},
61+
{
62+
"cell_type": "code",
63+
"execution_count": 10,
64+
"metadata": {},
65+
"outputs": [
66+
{
67+
"data": {
68+
"text/plain": [
69+
"[]"
70+
]
71+
},
72+
"execution_count": 10,
73+
"metadata": {},
74+
"output_type": "execute_result"
75+
}
76+
],
77+
"source": [
78+
"soup.find_all('div', attrs={'class':'classificacao_pontos-corridos'})"
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": 20,
84+
"metadata": {},
85+
"outputs": [],
86+
"source": [
87+
"table = soup.find_all(class_=\"medium-centered\")[0]"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 28,
93+
"metadata": {},
94+
"outputs": [
95+
{
96+
"name": "stderr",
97+
"output_type": "stream",
98+
"text": [
99+
"/opt/conda/lib/python3.7/site-packages/selenium/webdriver/phantomjs/webdriver.py:49: UserWarning: Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead\n",
100+
" warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless '\n"
101+
]
102+
},
103+
{
104+
"ename": "WebDriverException",
105+
"evalue": "Message: 'phantomjs' executable needs to be in PATH. \n",
106+
"output_type": "error",
107+
"traceback": [
108+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
109+
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
110+
"\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 76\u001b[0;31m stdin=PIPE)\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
111+
"\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m 799\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 800\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 801\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
112+
"\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1550\u001b[0m \u001b[0merr_msg\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m': '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1551\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1552\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
113+
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'phantomjs': 'phantomjs'",
114+
"\nDuring handling of the above exception, another exception occurred:\n",
115+
"\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)",
116+
"\u001b[0;32m<ipython-input-28-ab66e713d1f1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdriver\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPhantomJS\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmy_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mp_element\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid_\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'intro-text'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp_element\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
117+
"\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/phantomjs/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, executable_path, port, desired_capabilities, service_args, service_log_path)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0mservice_args\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mservice_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m log_path=service_log_path)\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
118+
"\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 81\u001b[0m raise WebDriverException(\n\u001b[1;32m 82\u001b[0m \"'%s' executable needs to be in PATH. %s\" % (\n\u001b[0;32m---> 83\u001b[0;31m os.path.basename(self.path), self.start_error_message)\n\u001b[0m\u001b[1;32m 84\u001b[0m )\n\u001b[1;32m 85\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEACCES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
119+
"\u001b[0;31mWebDriverException\u001b[0m: Message: 'phantomjs' executable needs to be in PATH. \n"
120+
]
121+
}
122+
],
123+
"source": [
124+
"from selenium import webdriver\n",
125+
"driver = webdriver.PhantomJS()\n",
126+
"driver.get(my_url)\n",
127+
"p_element = driver.find_element_by_id(id_='intro-text')\n",
128+
"print(p_element.text)"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 27,
134+
"metadata": {},
135+
"outputs": [
136+
{
137+
"name": "stdout",
138+
"output_type": "stream",
139+
"text": [
140+
"Collecting selenium\n",
141+
" Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)\n",
142+
"\u001b[K |████████████████████████████████| 904 kB 4.7 MB/s eta 0:00:01\n",
143+
"\u001b[?25hRequirement already satisfied: urllib3 in /opt/conda/lib/python3.7/site-packages (from selenium) (1.25.7)\n",
144+
"Installing collected packages: selenium\n",
145+
"Successfully installed selenium-3.141.0\n"
146+
]
147+
}
148+
],
149+
"source": [
150+
"! pip install selenium"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": []
159+
}
160+
],
161+
"metadata": {
162+
"kernelspec": {
163+
"display_name": "Python 3",
164+
"language": "python",
165+
"name": "python3"
166+
},
167+
"language_info": {
168+
"codemirror_mode": {
169+
"name": "ipython",
170+
"version": 3
171+
},
172+
"file_extension": ".py",
173+
"mimetype": "text/x-python",
174+
"name": "python",
175+
"nbconvert_exporter": "python",
176+
"pygments_lexer": "ipython3",
177+
"version": "3.7.6"
178+
}
179+
},
180+
"nbformat": 4,
181+
"nbformat_minor": 4
182+
}

0 commit comments

Comments
 (0)