Skip to content

Commit 47fd327

Browse files
committed
Update 13_Scraping.ipynb
1 parent 670969c commit 47fd327

File tree

1 file changed

+53
-3
lines changed

1 file changed

+53
-3
lines changed

Notebooks/Python_Tasks/13_Scraping.ipynb

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1735,6 +1735,15 @@
17351735
"### 3 - Parsing dynamically updated data via javascript"
17361736
]
17371737
},
1738+
{
1739+
"cell_type": "code",
1740+
"execution_count": null,
1741+
"metadata": {},
1742+
"outputs": [],
1743+
"source": [
1744+
"https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python"
1745+
]
1746+
},
17381747
{
17391748
"cell_type": "code",
17401749
"execution_count": 178,
@@ -1777,21 +1786,62 @@
17771786
"execution_count": null,
17781787
"metadata": {},
17791788
"outputs": [],
1780-
"source": []
1789+
"source": [
1790+
"import requests\n",
1791+
"from bs4 import BeautifulSoup\n",
1792+
"response = requests.get(my_url)\n",
1793+
"soup = BeautifulSoup(response.text)\n",
1794+
"soup.find(id=\"intro-text\")\n",
1795+
"# Result:\n",
1796+
"<p id=\"intro-text\">No javascript support</p>"
1797+
]
1798+
},
1799+
{
1800+
"cell_type": "code",
1801+
"execution_count": null,
1802+
"metadata": {},
1803+
"outputs": [],
1804+
"source": [
1805+
"from selenium import webdriver\n",
1806+
"driver = webdriver.PhantomJS()\n",
1807+
"driver.get(my_url)\n",
1808+
"p_element = driver.find_element_by_id(id_='intro-text')\n",
1809+
"print(p_element.text)\n",
1810+
"# result:\n",
1811+
"'Yay! Supports javascript'\n"
1812+
]
17811813
},
17821814
{
17831815
"cell_type": "code",
17841816
"execution_count": null,
17851817
"metadata": {},
17861818
"outputs": [],
1787-
"source": []
1819+
"source": [
1820+
"from selenium import webdriver\n",
1821+
"import time\n",
1822+
"\n",
1823+
"driver = webdriver.Firefox()\n",
1824+
"driver.get(url)\n",
1825+
"time.sleep(5)\n",
1826+
"htmlSource = driver.page_source"
1827+
]
17881828
},
17891829
{
17901830
"cell_type": "code",
17911831
"execution_count": null,
17921832
"metadata": {},
17931833
"outputs": [],
1794-
"source": []
1834+
"source": [
1835+
"import dryscrape\n",
1836+
"from bs4 import BeautifulSoup\n",
1837+
"session = dryscrape.Session()\n",
1838+
"session.visit(my_url)\n",
1839+
"response = session.body()\n",
1840+
"soup = BeautifulSoup(response)\n",
1841+
"soup.find(id=\"intro-text\")\n",
1842+
"# Result:\n",
1843+
"<p id=\"intro-text\">Yay! Supports javascript</p>"
1844+
]
17951845
},
17961846
{
17971847
"cell_type": "markdown",

0 commit comments

Comments
 (0)