|
1735 | 1735 | "### 3 - Parsing dynamically updated data via javascript" |
1736 | 1736 | ] |
1737 | 1737 | }, |
| 1738 | + { |
| 1739 | + "cell_type": "code", |
| 1740 | + "execution_count": null, |
| 1741 | + "metadata": {}, |
| 1742 | + "outputs": [], |
| 1743 | + "source": [ |
| 1744 | + "https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python" |
| 1745 | + ] |
| 1746 | + }, |
1738 | 1747 | { |
1739 | 1748 | "cell_type": "code", |
1740 | 1749 | "execution_count": 178, |
|
1777 | 1786 | "execution_count": null, |
1778 | 1787 | "metadata": {}, |
1779 | 1788 | "outputs": [], |
1780 | | - "source": [] |
| 1789 | + "source": [ |
| 1790 | + "import requests\n", |
| 1791 | + "from bs4 import BeautifulSoup\n", |
| 1792 | + "response = requests.get(my_url)\n", |
| 1793 | + "soup = BeautifulSoup(response.text)\n", |
| 1794 | + "soup.find(id=\"intro-text\")\n", |
| 1795 | + "# Result:\n", |
| 1796 | + "# <p id=\"intro-text\">No javascript support</p>" |
| 1797 | + ] |
| 1798 | + }, |
| 1799 | + { |
| 1800 | + "cell_type": "code", |
| 1801 | + "execution_count": null, |
| 1802 | + "metadata": {}, |
| 1803 | + "outputs": [], |
| 1804 | + "source": [ |
| 1805 | + "from selenium import webdriver\n", |
| 1806 | + "driver = webdriver.PhantomJS()\n", |
| 1807 | + "driver.get(my_url)\n", |
| 1808 | + "p_element = driver.find_element_by_id(id_='intro-text')\n", |
| 1809 | + "print(p_element.text)\n", |
| 1810 | + "# result:\n", |
| 1811 | + "'Yay! Supports javascript'\n" |
| 1812 | + ] |
1781 | 1813 | }, |
1782 | 1814 | { |
1783 | 1815 | "cell_type": "code", |
1784 | 1816 | "execution_count": null, |
1785 | 1817 | "metadata": {}, |
1786 | 1818 | "outputs": [], |
1787 | | - "source": [] |
| 1819 | + "source": [ |
| 1820 | + "from selenium import webdriver\n", |
| 1821 | + "import time\n", |
| 1822 | + "\n", |
| 1823 | + "driver = webdriver.Firefox()\n", |
| 1824 | + "driver.get(url)\n", |
| 1825 | + "time.sleep(5)\n", |
| 1826 | + "htmlSource = driver.page_source" |
| 1827 | + ] |
1788 | 1828 | }, |
1789 | 1829 | { |
1790 | 1830 | "cell_type": "code", |
1791 | 1831 | "execution_count": null, |
1792 | 1832 | "metadata": {}, |
1793 | 1833 | "outputs": [], |
1794 | | - "source": [] |
| 1834 | + "source": [ |
| 1835 | + "import dryscrape\n", |
| 1836 | + "from bs4 import BeautifulSoup\n", |
| 1837 | + "session = dryscrape.Session()\n", |
| 1838 | + "session.visit(my_url)\n", |
| 1839 | + "response = session.body()\n", |
| 1840 | + "soup = BeautifulSoup(response)\n", |
| 1841 | + "soup.find(id=\"intro-text\")\n", |
| 1842 | + "# Result:\n", |
| 1843 | + "# <p id=\"intro-text\">Yay! Supports javascript</p>" |
| 1844 | + ] |
1795 | 1845 | }, |
1796 | 1846 | { |
1797 | 1847 | "cell_type": "markdown", |
|
0 commit comments