Skip to content

Commit 5e1a455

Browse files
committed
Completed the 16th episode
1 parent 4cc7214 commit 5e1a455

File tree

10 files changed

+1015
-572
lines changed

10 files changed

+1015
-572
lines changed

Course/.DS_Store

0 Bytes
Binary file not shown.

Course/13_asychronous_web_scraping/.ipynb_checkpoints/asynchronous-web-scraping-with-aiohttp-and-asyncio-Copy1-checkpoint.ipynb

Lines changed: 0 additions & 531 deletions
This file was deleted.

Course/13_asychronous_web_scraping/asynchronous-web-scraping-with-aiohttp-and-asyncio-starter-code.ipynb

Lines changed: 40 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
},
7575
{
7676
"cell_type": "code",
77-
"execution_count": 7,
77+
"execution_count": 2,
7878
"metadata": {},
7979
"outputs": [
8080
{
@@ -165,7 +165,7 @@
165165
},
166166
{
167167
"cell_type": "code",
168-
"execution_count": 10,
168+
"execution_count": 6,
169169
"metadata": {},
170170
"outputs": [
171171
{
@@ -757,7 +757,7 @@
757757
" <div class=\"small-widget download-widget\">\n",
758758
" <h2 class=\"widget-title\"><span aria-hidden=\"true\" class=\"icon-download\"></span>Download</h2>\n",
759759
"<p>Python source code and installers are available for download for all versions!</p>\n",
760-
"<p>Latest: <a href=\"/downloads/release/python-390/\">Python 3.9.0</a></p>\n",
760+
"<p>Latest: <a href=\"/downloads/release/python-391/\">Python 3.9.1</a></p>\n",
761761
" </div>\n",
762762
"\n",
763763
" <div class=\"small-widget documentation-widget\">\n",
@@ -787,24 +787,24 @@
787787
" \n",
788788
" \n",
789789
" <li>\n",
790-
"<time datetime=\"2020-12-01T15:46:00.000001+00:00\"><span class=\"say-no-more\">2020-</span>12-01</time>\n",
791-
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/kJNxOgJs0jI/help-psf-raise-60000-usd-by-december.html\">Help the PSF raise $60,000 USD by December 31st!</a></li>\n",
790+
"<time datetime=\"2020-12-11T09:08:00.000004+00:00\"><span class=\"say-no-more\">2020-</span>12-11</time>\n",
791+
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/9OlQ_iCm9HA/elaine-wong-awarded-psf-community.html\">Elaine Wong Awarded the PSF Community Service Award for Q3 2020</a></li>\n",
792792
" \n",
793793
" <li>\n",
794-
"<time datetime=\"2020-11-30T12:53:00+00:00\"><span class=\"say-no-more\">2020-</span>11-30</time>\n",
795-
" <a href=\"http://feedproxy.google.com/~r/PythonInsider/~3/EEnvsGTMFZw/pip-20-3-release-new-resolver.html\">Releasing pip 20.3, featuring new dependency resolver</a></li>\n",
794+
"<time datetime=\"2020-12-10T16:03:30.000003+00:00\"><span class=\"say-no-more\">2020-</span>12-10</time>\n",
795+
" <a href=\"https://mailchi.mp/python/psf-sept-626488\">Python Software Foundation - December 2020 Newsletter</a></li>\n",
796796
" \n",
797797
" <li>\n",
798-
"<time datetime=\"2020-11-30T12:53:00+00:00\"><span class=\"say-no-more\">2020-</span>11-30</time>\n",
799-
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/9FGHV2ipjlQ/pip-20-3-new-resolver.html\">Releasing pip 20.3, featuring new dependency resolver</a></li>\n",
798+
"<time datetime=\"2020-12-09T12:58:00.000002+00:00\"><span class=\"say-no-more\">2020-</span>12-09</time>\n",
799+
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/WSiD7sUHgKY/announcing-psf-diversity-and-inclusion.html\">Announcing the PSF Diversity and Inclusion Work Group</a></li>\n",
800800
" \n",
801801
" <li>\n",
802-
"<time datetime=\"2020-11-20T20:48:00.000004+00:00\"><span class=\"say-no-more\">2020-</span>11-20</time>\n",
803-
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/GyAo4JPW_RQ/pypi-receives-aws-credits-for-open.html\">PyPI receives AWS credits for open source projects</a></li>\n",
802+
"<time datetime=\"2020-12-08T01:36:00.000001+00:00\"><span class=\"say-no-more\">2020-</span>12-08</time>\n",
803+
" <a href=\"http://feedproxy.google.com/~r/PythonInsider/~3/VK02PDdS-VU/python-391-is-now-available-together.html\">Python 3.9.1 is now available, together with 3.10.0a3 and 3.8.7rc1</a></li>\n",
804804
" \n",
805805
" <li>\n",
806-
"<time datetime=\"2020-11-11T09:53:00.000002+00:00\"><span class=\"say-no-more\">2020-</span>11-11</time>\n",
807-
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/04NQ7_WSg9A/rami-chowdhury-awarded-psf-community.html\">Rami Chowdhury Awarded the PSF Community Service Award for Q3 2020</a></li>\n",
806+
"<time datetime=\"2020-12-01T15:46:00.000001+00:00\"><span class=\"say-no-more\">2020-</span>12-01</time>\n",
807+
" <a href=\"http://feedproxy.google.com/~r/PythonSoftwareFoundationNews/~3/kJNxOgJs0jI/help-psf-raise-60000-usd-by-december.html\">Help the PSF raise $60,000 USD by December 31st!</a></li>\n",
808808
" \n",
809809
" </ul>\n",
810810
" </div><!-- end .shrubbery -->\n",
@@ -823,12 +823,6 @@
823823
" \n",
824824
" \n",
825825
" <li>\n",
826-
"<time datetime=\"2020-12-11T00:00:00+00:00\"><span class=\"say-no-more\">2020-</span>12-11</time>\n",
827-
" <a href=\"/events/python-events/996/\">PyCode Conference 2020</a></li>\n",
828-
" \n",
829-
" \n",
830-
" \n",
831-
" <li>\n",
832826
"<time datetime=\"2020-12-12T06:00:00+00:00\"><span class=\"say-no-more\">2020-</span>12-12</time>\n",
833827
" <a href=\"/events/python-user-group/993/\">Python Mauritius User Group (PyMUG) - December Meetup</a></li>\n",
834828
" \n",
@@ -841,14 +835,20 @@
841835
" \n",
842836
" \n",
843837
" <li>\n",
844-
"<time datetime=\"2021-01-30T00:00:00+00:00\"><span class=\"say-no-more\">2021-</span>01-30</time>\n",
845-
" <a href=\"/events/python-events/994/\">BelPy 2021</a></li>\n",
838+
"<time datetime=\"2020-12-17T18:00:00+00:00\"><span class=\"say-no-more\">2020-</span>12-17</time>\n",
839+
" <a href=\"/events/python-user-group/1000/\">Pykonik Tech Talks #54</a></li>\n",
840+
" \n",
841+
" \n",
842+
" \n",
843+
" <li>\n",
844+
"<time datetime=\"2020-12-31T00:00:00+00:00\"><span class=\"say-no-more\">2020-</span>12-31</time>\n",
845+
" <a href=\"/events/python-events/1001/\">Python Pizza New Year&#39;s Party</a></li>\n",
846846
" \n",
847847
" \n",
848848
" \n",
849849
" <li>\n",
850-
"<time datetime=\"2021-02-19T00:00:00+00:00\"><span class=\"say-no-more\">2021-</span>02-19</time>\n",
851-
" <a href=\"/events/python-events/997/\">PyCascades 2021</a></li>\n",
850+
"<time datetime=\"2021-01-30T00:00:00+00:00\"><span class=\"say-no-more\">2021-</span>01-30</time>\n",
851+
" <a href=\"/events/python-events/994/\">BelPy 2021</a></li>\n",
852852
" \n",
853853
" \n",
854854
" </ul>\n",
@@ -872,17 +872,17 @@
872872
" <p class=\"give-me-more\"><a href=\"/success-stories/\" title=\"More Success Stories\">More</a></p>\n",
873873
"\n",
874874
" \n",
875-
" <div class=\"success-story-item\" id=\"success-story-930\">\n",
875+
" <div class=\"success-story-item\" id=\"success-story-838\">\n",
876876
"\n",
877877
" <blockquote>\n",
878-
" <a href=\"/success-stories/python-for-financial-machine-learning-at-union-investment/\">Python and its broad variety of libraries are very well suited to develop customized machine learning tools which tackle the complex challenges posed by financial time series.</a>\n",
878+
" <a href=\"/success-stories/saving-the-world-with-open-data-and-python/\">When an Open Data standard is created and promoted, it’s important to think why - what change is this trying to drive? What will people do with this data that they couldn’t do before?</a>\n",
879879
" </blockquote>\n",
880880
"\n",
881881
" <table cellpadding=\"0\" cellspacing=\"0\" border=\"0\" width=\"100%\" class=\"quote-from\">\n",
882882
" <tbody>\n",
883883
" <tr>\n",
884884
" \n",
885-
" <td><p><a href=\"/success-stories/python-for-financial-machine-learning-at-union-investment/\">Python for Financial Machine Learning at Union Investment</a> <em>by Dr. Christian Mandery and Nikolas Gerlich</em></p></td>\n",
885+
" <td><p><a href=\"/success-stories/saving-the-world-with-open-data-and-python/\">Saving the world with Open Data and Python</a> <em>by James Baster</em></p></td>\n",
886886
" </tr>\n",
887887
" </tbody>\n",
888888
" </table>\n",
@@ -1263,7 +1263,6 @@
12631263
" async with aiohttp.ClientSession() as session:\n",
12641264
" async with session.get('http://python.org') as response:\n",
12651265
" text = await response.text()\n",
1266-
" some_cleaned_data = await function_name(text)\n",
12671266
" print('Test')\n",
12681267
" print(text)\n",
12691268
" \n",
@@ -1272,16 +1271,16 @@
12721271
},
12731272
{
12741273
"cell_type": "code",
1275-
"execution_count": 5,
1274+
"execution_count": 7,
12761275
"metadata": {},
12771276
"outputs": [
12781277
{
12791278
"data": {
12801279
"text/plain": [
1281-
"<coroutine object main at 0x7fc92cc5e170>"
1280+
"<coroutine object main at 0x7fbd17871050>"
12821281
]
12831282
},
1284-
"execution_count": 5,
1283+
"execution_count": 7,
12851284
"metadata": {},
12861285
"output_type": "execute_result"
12871286
}
@@ -1379,7 +1378,7 @@
13791378
},
13801379
{
13811380
"cell_type": "code",
1382-
"execution_count": 47,
1381+
"execution_count": 9,
13831382
"metadata": {},
13841383
"outputs": [],
13851384
"source": [
@@ -1422,7 +1421,7 @@
14221421
},
14231422
{
14241423
"cell_type": "code",
1425-
"execution_count": 48,
1424+
"execution_count": 10,
14261425
"metadata": {},
14271426
"outputs": [],
14281427
"source": [
@@ -1432,7 +1431,7 @@
14321431
},
14331432
{
14341433
"cell_type": "code",
1435-
"execution_count": 49,
1434+
"execution_count": 11,
14361435
"metadata": {},
14371436
"outputs": [],
14381437
"source": [
@@ -1442,7 +1441,7 @@
14421441
},
14431442
{
14441443
"cell_type": "code",
1445-
"execution_count": 50,
1444+
"execution_count": 12,
14461445
"metadata": {},
14471446
"outputs": [
14481447
{
@@ -1451,7 +1450,7 @@
14511450
"2"
14521451
]
14531452
},
1454-
"execution_count": 50,
1453+
"execution_count": 12,
14551454
"metadata": {},
14561455
"output_type": "execute_result"
14571456
}
@@ -1497,7 +1496,7 @@
14971496
},
14981497
{
14991498
"cell_type": "code",
1500-
"execution_count": 51,
1499+
"execution_count": 14,
15011500
"metadata": {},
15021501
"outputs": [],
15031502
"source": [
@@ -1550,7 +1549,7 @@
15501549
},
15511550
{
15521551
"cell_type": "code",
1553-
"execution_count": 52,
1552+
"execution_count": 15,
15541553
"metadata": {},
15551554
"outputs": [],
15561555
"source": [
@@ -1559,22 +1558,22 @@
15591558
},
15601559
{
15611560
"cell_type": "code",
1562-
"execution_count": 53,
1561+
"execution_count": 17,
15631562
"metadata": {},
15641563
"outputs": [
15651564
{
15661565
"data": {
15671566
"text/plain": [
1568-
"<title>understandingdata.com | 520: Web server is returning an unknown error</title>"
1567+
"bs4.element.Tag"
15691568
]
15701569
},
1571-
"execution_count": 53,
1570+
"execution_count": 17,
15721571
"metadata": {},
15731572
"output_type": "execute_result"
15741573
}
15751574
],
15761575
"source": [
1577-
"scraper.master_dict['https://understandingdata.com/']['Title']"
1576+
"type(scraper.master_dict['https://understandingdata.com/']['Title'])"
15781577
]
15791578
},
15801579
{
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# How To Extract The Text From Multiple Webpages In Python"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"When you're doing content analysis at scale, it can be very useful to know how to automatically extract the text from multiple webpages."
15+
]
16+
},
17+
{
18+
"cell_type": "markdown",
19+
"metadata": {},
20+
"source": [
21+
"-----------------------------------------------------------------"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"Firstly we'll define a WebScraper class that asynchronously fetches the raw HTML for each URL in a list."
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 47,
36+
"metadata": {},
37+
"outputs": [],
38+
"source": [
39+
"class WebScraper(object):\n",
40+
" def __init__(self, urls):\n",
41+
" self.urls = urls\n",
42+
" # Global Place To Store The Data:\n",
43+
" self.all_data = []\n",
44+
" self.master_dict = {}\n",
45+
" # Run The Scraper:\n",
46+
" asyncio.run(self.main())\n",
47+
"\n",
48+
" async def fetch(self, session, url):\n",
49+
" try:\n",
50+
" async with session.get(url) as response:\n",
51+
" text = await response.text()\n",
52+
" return text, url\n",
53+
" except Exception as e:\n",
54+
" print(str(e))\n",
55+
"\n",
56+
" async def main(self):\n",
57+
" tasks = []\n",
58+
" headers = {\n",
59+
" \"user-agent\": \"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\"}\n",
60+
" async with aiohttp.ClientSession(headers=headers) as session:\n",
61+
" for url in self.urls:\n",
62+
" tasks.append(self.fetch(session, url))\n",
63+
"\n",
64+
" htmls = await asyncio.gather(*tasks)\n",
65+
" self.all_data.extend(htmls)\n",
66+
"\n",
67+
" # Storing the raw HTML data.\n",
68+
" for html in htmls:\n",
69+
" if html is not None:\n",
70+
" url = html[1]\n",
71+
" self.master_dict[url] = {'Raw Html': html[0]}\n",
72+
" else:\n",
73+
" continue\n"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": 48,
79+
"metadata": {},
80+
"outputs": [],
81+
"source": [
82+
"# 1. Create a list of URLs for our scraper to get the data for:\n",
83+
"urls = ['https://understandingdata.com/', 'http://twitter.com/']"
84+
]
85+
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": 49,
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"# 2. Create the scraper class instance; this automatically creates a new event loop within the __init__ method:\n",
93+
"scraper = WebScraper(urls = urls)"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": 50,
99+
"metadata": {},
100+
"outputs": [
101+
{
102+
"data": {
103+
"text/plain": [
104+
"2"
105+
]
106+
},
107+
"execution_count": 50,
108+
"metadata": {},
109+
"output_type": "execute_result"
110+
}
111+
],
112+
"source": [
113+
"# 3. Notice how we have a list length of 2:\n",
114+
"len(scraper.all_data)"
115+
]
116+
},
117+
{
118+
"cell_type": "markdown",
119+
"metadata": {},
120+
"source": [
121+
"------------------------------------------------------------------------"
122+
]
123+
}
124+
],
125+
"metadata": {
126+
"kernelspec": {
127+
"display_name": "Python 3",
128+
"language": "python",
129+
"name": "python3"
130+
},
131+
"language_info": {
132+
"codemirror_mode": {
133+
"name": "ipython",
134+
"version": 3
135+
},
136+
"file_extension": ".py",
137+
"mimetype": "text/x-python",
138+
"name": "python",
139+
"nbconvert_exporter": "python",
140+
"pygments_lexer": "ipython3",
141+
"version": "3.7.9"
142+
}
143+
},
144+
"nbformat": 4,
145+
"nbformat_minor": 4
146+
}

0 commit comments

Comments
 (0)