|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": 1, |
| 14 | + "execution_count": 14, |
15 | 15 | "metadata": {}, |
16 | 16 | "outputs": [], |
17 | 17 | "source": [ |
|
39 | 39 | }, |
40 | 40 | { |
41 | 41 | "cell_type": "code", |
42 | | - "execution_count": 2, |
| 42 | + "execution_count": 3, |
43 | 43 | "metadata": {}, |
44 | 44 | "outputs": [ |
45 | 45 | { |
|
48 | 48 | "12" |
49 | 49 | ] |
50 | 50 | }, |
51 | | - "execution_count": 2, |
| 51 | + "execution_count": 3, |
52 | 52 | "metadata": {}, |
53 | 53 | "output_type": "execute_result" |
54 | 54 | } |
|
61 | 61 | }, |
62 | 62 | { |
63 | 63 | "cell_type": "code", |
64 | | - "execution_count": 77, |
| 64 | + "execution_count": 4, |
65 | 65 | "metadata": {}, |
66 | 66 | "outputs": [ |
67 | 67 | { |
|
160 | 160 | "4 5.0 Innsbruck 118112 113392 126851 Tyrol" |
161 | 161 | ] |
162 | 162 | }, |
163 | | - "execution_count": 77, |
| 163 | + "execution_count": 4, |
164 | 164 | "metadata": {}, |
165 | 165 | "output_type": "execute_result" |
166 | 166 | } |
|
171 | 171 | }, |
172 | 172 | { |
173 | 173 | "cell_type": "code", |
174 | | - "execution_count": 4, |
| 174 | + "execution_count": 5, |
175 | 175 | "metadata": {}, |
176 | 176 | "outputs": [ |
177 | 177 | { |
|
250 | 250 | "4 Burgenland Bad Tatzmannsdorf Other municipality 1554" |
251 | 251 | ] |
252 | 252 | }, |
253 | | - "execution_count": 4, |
| 253 | + "execution_count": 5, |
254 | 254 | "metadata": {}, |
255 | 255 | "output_type": "execute_result" |
256 | 256 | } |
|
268 | 268 | }, |
269 | 269 | { |
270 | 270 | "cell_type": "code", |
271 | | - "execution_count": 79, |
| 271 | + "execution_count": 6, |
272 | 272 | "metadata": {}, |
273 | 273 | "outputs": [ |
274 | 274 | { |
|
301 | 301 | }, |
302 | 302 | { |
303 | 303 | "cell_type": "code", |
304 | | - "execution_count": 80, |
| 304 | + "execution_count": 7, |
305 | 305 | "metadata": {}, |
306 | 306 | "outputs": [ |
307 | 307 | { |
|
310 | 310 | "'<!DOCTYPE html>\\n<html lang=\"de\">\\n<head>\\n<meta charset=\"utf-8\" />\\n<meta http-equiv=\"cache-control\" content=\"no-cache\" />\\n<meta http-equiv=\"pragma\" content=\"no-cache\" />\\n<meta name=\"robots\" content=\"index, follow, noarchive\" />\\n<meta name=\"content-language\" content=\"de\" />\\n<meta name=\"description\" content=\"Am Donnerstag ging es zwar "nur" um eine Impfkampagne. Der steirische Landeshauptmann Hermann Schützenhöfer plant aber die Corona-Impfung als Voraussetzung bei Neueinstellung von Kindergartenpersonal.\" />\\n<meta name=\"author\" content=\"Martina Madner\" />\\n<meta name=\"copyright\" content=\"Wiener Zeitung Online\" />\\n<meta property=\"og:type\" content=\"article\" />\\n<meta property=\"og:title\" content=\"Impfpflicht - Härtere steirische Impfanreize\" />\\n<meta property=\"og:description\" content=\"Am Donnerstag ging es zwar "nur" um eine Impfkampagne. Der steirische Landeshauptmann Hermann Schützenhöfer plant aber die Corona-Impfung als...\" />\\n<meta property=\"og'" |
311 | 311 | ] |
312 | 312 | }, |
313 | | - "execution_count": 80, |
| 313 | + "execution_count": 7, |
314 | 314 | "metadata": {}, |
315 | 315 | "output_type": "execute_result" |
316 | 316 | } |
|
328 | 328 | }, |
329 | 329 | { |
330 | 330 | "cell_type": "code", |
331 | | - "execution_count": 81, |
| 331 | + "execution_count": 8, |
332 | 332 | "metadata": {}, |
333 | 333 | "outputs": [ |
334 | 334 | { |
|
344 | 344 | "print(type(soup))" |
345 | 345 | ] |
346 | 346 | }, |
| 347 | + { |
| 348 | + "cell_type": "markdown", |
| 349 | + "metadata": {}, |
| 350 | + "source": [ |
| 351 | + "#### A quick note on parsers:" |
| 352 | + ] |
| 353 | + }, |
| 354 | + { |
| 355 | + "cell_type": "code", |
| 356 | + "execution_count": 23, |
| 357 | + "metadata": {}, |
| 358 | + "outputs": [ |
| 359 | + { |
| 360 | + "data": { |
| 361 | + "text/html": [ |
| 362 | + "<div>\n", |
| 363 | + "<style scoped>\n", |
| 364 | + " .dataframe tbody tr th:only-of-type {\n", |
| 365 | + " vertical-align: middle;\n", |
| 366 | + " }\n", |
| 367 | + "\n", |
| 368 | + " .dataframe tbody tr th {\n", |
| 369 | + " vertical-align: top;\n", |
| 370 | + " }\n", |
| 371 | + "\n", |
| 372 | + " .dataframe thead th {\n", |
| 373 | + " text-align: right;\n", |
| 374 | + " }\n", |
| 375 | + "</style>\n", |
| 376 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 377 | + " <thead>\n", |
| 378 | + " <tr style=\"text-align: right;\">\n", |
| 379 | + " <th></th>\n", |
| 380 | + " <th>Parser</th>\n", |
| 381 | + " <th>Typical usage</th>\n", |
| 382 | + " <th>Advantages</th>\n", |
| 383 | + " <th>Disadvantages</th>\n", |
| 384 | + " </tr>\n", |
| 385 | + " </thead>\n", |
| 386 | + " <tbody>\n", |
| 387 | + " <tr>\n", |
| 388 | + " <th>1</th>\n", |
| 389 | + " <td>Python’s html.parser</td>\n", |
| 390 | + " <td>BeautifulSoup(markup, \"html.parser\")</td>\n", |
| 391 | + " <td>Batteries included Decent speed Lenient (As of...</td>\n", |
| 392 | + " <td>Not as fast as lxml, less lenient than html5lib.</td>\n", |
| 393 | + " </tr>\n", |
| 394 | + " <tr>\n", |
| 395 | + " <th>2</th>\n", |
| 396 | + " <td>lxml’s HTML parser</td>\n", |
| 397 | + " <td>BeautifulSoup(markup, \"lxml\")</td>\n", |
| 398 | + " <td>Very fast Lenient</td>\n", |
| 399 | + " <td>External C dependency</td>\n", |
| 400 | + " </tr>\n", |
| 401 | + " <tr>\n", |
| 402 | + " <th>3</th>\n", |
| 403 | + " <td>lxml’s XML parser</td>\n", |
| 404 | + " <td>BeautifulSoup(markup, \"lxml-xml\") BeautifulSou...</td>\n", |
| 405 | + " <td>Very fast The only currently supported XML parser</td>\n", |
| 406 | + " <td>External C dependency</td>\n", |
| 407 | + " </tr>\n", |
| 408 | + " <tr>\n", |
| 409 | + " <th>4</th>\n", |
| 410 | + " <td>html5lib</td>\n", |
| 411 | + " <td>BeautifulSoup(markup, \"html5lib\")</td>\n", |
| 412 | + " <td>Extremely lenient Parses pages the same way a ...</td>\n", |
| 413 | + " <td>Very slow External Python dependency</td>\n", |
| 414 | + " </tr>\n", |
| 415 | + " </tbody>\n", |
| 416 | + "</table>\n", |
| 417 | + "</div>" |
| 418 | + ], |
| 419 | + "text/plain": [ |
| 420 | + "0 Parser Typical usage \\\n", |
| 421 | + "1 Python’s html.parser BeautifulSoup(markup, \"html.parser\") \n", |
| 422 | + "2 lxml’s HTML parser BeautifulSoup(markup, \"lxml\") \n", |
| 423 | + "3 lxml’s XML parser BeautifulSoup(markup, \"lxml-xml\") BeautifulSou... \n", |
| 424 | + "4 html5lib BeautifulSoup(markup, \"html5lib\") \n", |
| 425 | + "\n", |
| 426 | + "0 Advantages \\\n", |
| 427 | + "1 Batteries included Decent speed Lenient (As of... \n", |
| 428 | + "2 Very fast Lenient \n", |
| 429 | + "3 Very fast The only currently supported XML parser \n", |
| 430 | + "4 Extremely lenient Parses pages the same way a ... \n", |
| 431 | + "\n", |
| 432 | + "0 Disadvantages \n", |
| 433 | + "1 Not as fast as lxml, less lenient than html5lib. \n", |
| 434 | + "2 External C dependency \n", |
| 435 | + "3 External C dependency \n", |
| 436 | + "4 Very slow External Python dependency " |
| 437 | + ] |
| 438 | + }, |
| 439 | + "execution_count": 23, |
| 440 | + "metadata": {}, |
| 441 | + "output_type": "execute_result" |
| 442 | + } |
| 443 | + ], |
| 444 | + "source": [ |
| 445 | + "df_parsers = pd.read_html(\"https://www.crummy.com/software/BeautifulSoup/bs4/doc/\")[0]\n", |
| 446 | + "df_parsers.columns = df_parsers.iloc[0]\n", |
| 447 | + "df_parsers.drop(df_parsers.index[0], inplace=True)\n", |
| 448 | + "df_parsers.head()" |
| 449 | + ] |
| 450 | + }, |
347 | 451 | { |
348 | 452 | "cell_type": "markdown", |
349 | 453 | "metadata": {}, |
|
353 | 457 | }, |
354 | 458 | { |
355 | 459 | "cell_type": "code", |
356 | | - "execution_count": 82, |
| 460 | + "execution_count": 9, |
357 | 461 | "metadata": {}, |
358 | 462 | "outputs": [ |
359 | 463 | { |
|
391 | 495 | }, |
392 | 496 | { |
393 | 497 | "cell_type": "code", |
394 | | - "execution_count": 84, |
| 498 | + "execution_count": 10, |
395 | 499 | "metadata": {}, |
396 | 500 | "outputs": [ |
397 | 501 | { |
398 | 502 | "name": "stdout", |
399 | 503 | "output_type": "stream", |
400 | 504 | "text": [ |
401 | | - "ete Werbekampagne fürs Impfen in der Steiermark. Es spricht von einer \\\"gewissen Impfmüdigkeit, die eingetreten ist, da rede ich nicht um den heißen Brei herum: Die Anmeldungen sind zurückgegangen.\\\" 560.000 von 1,1 Millionen impfbaren Steirerinnen und Steirern sind erstgeimpft, 227.000 auch ein zweites Mal. Sein Appell: \\\"Wir werden alles versuchen, damit sich die Menschen impfen lassen. Es ist noch nicht vorbei.\\\"<\\/p>\\n<p>In der ORF-\\\"Pressestunde\\\" sprach Schützenhöfer von einem \\\"schäbigem Verhalten\\\" der Impfunwilligen. Auf die Frage, ob er sich eine Sars-CoV-2-Schutzimpfungspflicht für das Kindergartenpersonal vorstellen könne, sagte er: \\\"Ich wäre persönlich dafür, der Verfassungsdienst des Bundes sagt aber: Nein, das geht nicht\\\" - und ließ dann aufhorchen: \\\"Aber: Jedes Land kann selber etwas bei der Aufnahme tun. Das will ich in den Bereichen des Landes selbstverständlich so machen.\\\"<\\/p>\\n<p>Impfnachz&\n" |
| 505 | + "un gestartete Werbekampagne fürs Impfen in der Steiermark. Es spricht von einer \\\"gewissen Impfmüdigkeit, die eingetreten ist, da rede ich nicht um den heißen Brei herum: Die Anmeldungen sind zurückgegangen.\\\" 560.000 von 1,1 Millionen impfbaren Steirerinnen und Steirern sind erstgeimpft, 227.000 auch ein zweites Mal. Sein Appell: \\\"Wir werden alles versuchen, damit sich die Menschen impfen lassen. Es ist noch nicht vorbei.\\\"<\\/p>\\n<p>In der ORF-\\\"Pressestunde\\\" sprach Schützenhöfer von einem \\\"schäbigem Verhalten\\\" der Impfunwilligen. Auf die Frage, ob er sich eine Sars-CoV-2-Schutzimpfungspflicht für das Kindergartenpersonal vorstellen könne, sagte er: \\\"Ich wäre persönlich dafür, der Verfassungsdienst des Bundes sagt aber: Nein, das geht nicht\\\" - und ließ dann aufhorchen: \\\"Aber: Jedes Land kann selber etwas bei der Aufnahme tun. Das will ich in den Bereichen des Landes selbstverständlich so machen.\\\"<\\/p>\\n<p>\n" |
402 | 506 | ] |
403 | 507 | } |
404 | 508 | ], |
|
408 | 512 | }, |
409 | 513 | { |
410 | 514 | "cell_type": "code", |
411 | | - "execution_count": 85, |
| 515 | + "execution_count": 11, |
412 | 516 | "metadata": {}, |
413 | 517 | "outputs": [ |
414 | 518 | { |
|
709 | 813 | "cell_type": "markdown", |
710 | 814 | "metadata": {}, |
711 | 815 | "source": [ |
712 | | - "article### Extracting specific information" |
| 816 | + "### Extracting specific information" |
713 | 817 | ] |
714 | 818 | }, |
715 | 819 | { |
|
3809 | 3913 | "name": "python", |
3810 | 3914 | "nbconvert_exporter": "python", |
3811 | 3915 | "pygments_lexer": "ipython3", |
3812 | | - "version": "3.8.5" |
| 3916 | + "version": "3.7.6" |
3813 | 3917 | } |
3814 | 3918 | }, |
3815 | 3919 | "nbformat": 4, |
|
0 commit comments