Skip to content

Commit d864e9a

Browse files
committed
Add Lesson 3 BeautifulSoup Web Scraping With Static & SSR Sites
1 parent 96d4951 commit d864e9a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+3514
-0
lines changed

003-beautiful-soup-with-static-site-and-server-side-rendered-web-scraping/website/301.txt

Whitespace-only changes.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
title: Not Found
3+
hero: This does not exist
4+
permalink: /404.html
5+
sitemap: false
6+
---
7+
8+
<p>This does not exist</p>
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Use the official Ruby image from Docker Hub
2+
FROM ruby:3.0.0
3+
4+
# Install the necessary libraries
5+
RUN apt-get update -qq && apt-get install -y build-essential libpq-dev
6+
7+
# Install Jekyll and Bundler
8+
RUN gem install jekyll
9+
RUN gem install bundler:1.17.3
10+
11+
12+
# Create a new directory for your Jekyll site
13+
RUN mkdir /usr/src/app
14+
15+
# Change to the new directory
16+
WORKDIR /usr/src/app
17+
18+
# Copy Gemfile and Gemfile.lock
19+
COPY Gemfile* ./
20+
21+
# Install the Gems
22+
RUN bundle install
23+
24+
# Copy the rest of your Jekyll site to the image
25+
COPY . .
26+
27+
# Make port 3000 available to the world outside this container
28+
EXPOSE 3000
29+
30+
# Execute Jekyll serve command
31+
CMD ["bundle", "exec", "jekyll", "serve", "--host", "0.0.0.0", "--port", "3000"]
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
source 'https://rubygems.org'
2+
3+
gem 'jekyll', '4.2.1'
4+
5+
group :jekyll_plugins do
6+
gem 'jekyll-seo-tag', '2.6.1'
7+
gem 'jekyll-sitemap', '1.3.1'
8+
end
9+
10+
gem 'webrick', '1.8.1'
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
GEM
2+
remote: https://rubygems.org/
3+
specs:
4+
addressable (2.8.0)
5+
public_suffix (>= 2.0.2, < 5.0)
6+
colorator (1.1.0)
7+
concurrent-ruby (1.1.9)
8+
em-websocket (0.5.3)
9+
eventmachine (>= 0.12.9)
10+
http_parser.rb (~> 0)
11+
eventmachine (1.2.7)
12+
eventmachine (1.2.7-x64-mingw32)
13+
ffi (1.15.5)
14+
ffi (1.15.5-x64-mingw32)
15+
forwardable-extended (2.6.0)
16+
http_parser.rb (0.8.0)
17+
i18n (1.8.11)
18+
concurrent-ruby (~> 1.0)
19+
jekyll (4.2.1)
20+
addressable (~> 2.4)
21+
colorator (~> 1.0)
22+
em-websocket (~> 0.5)
23+
i18n (~> 1.0)
24+
jekyll-sass-converter (~> 2.0)
25+
jekyll-watch (~> 2.0)
26+
kramdown (~> 2.3)
27+
kramdown-parser-gfm (~> 1.0)
28+
liquid (~> 4.0)
29+
mercenary (~> 0.4.0)
30+
pathutil (~> 0.9)
31+
rouge (~> 3.0)
32+
safe_yaml (~> 1.0)
33+
terminal-table (~> 2.0)
34+
jekyll-sass-converter (2.1.0)
35+
sassc (> 2.0.1, < 3.0)
36+
jekyll-seo-tag (2.6.1)
37+
jekyll (>= 3.3, < 5.0)
38+
jekyll-sitemap (1.3.1)
39+
jekyll (>= 3.7, < 5.0)
40+
jekyll-watch (2.2.1)
41+
listen (~> 3.0)
42+
kramdown (2.3.1)
43+
rexml
44+
kramdown-parser-gfm (1.1.0)
45+
kramdown (~> 2.0)
46+
liquid (4.0.3)
47+
listen (3.7.1)
48+
rb-fsevent (~> 0.10, >= 0.10.3)
49+
rb-inotify (~> 0.9, >= 0.9.10)
50+
mercenary (0.4.0)
51+
pathutil (0.16.2)
52+
forwardable-extended (~> 2.6)
53+
public_suffix (4.0.6)
54+
rb-fsevent (0.11.0)
55+
rb-inotify (0.10.1)
56+
ffi (~> 1.0)
57+
rexml (3.2.5)
58+
rouge (3.27.0)
59+
safe_yaml (1.0.5)
60+
sassc (2.4.0)
61+
ffi (~> 1.9)
62+
sassc (2.4.0-x64-mingw32)
63+
ffi (~> 1.9)
64+
terminal-table (2.0.0)
65+
unicode-display_width (~> 1.1, >= 1.1.1)
66+
unicode-display_width (1.8.0)
67+
webrick (1.8.1)
68+
69+
PLATFORMS
70+
ruby
71+
x64-mingw32
72+
73+
DEPENDENCIES
74+
jekyll (= 4.2.1)
75+
jekyll-seo-tag (= 2.6.1)
76+
jekyll-sitemap (= 1.3.1)
77+
webrick (= 1.8.1)
78+
79+
BUNDLED WITH
80+
1.17.3
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2016 CloudCannon
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
dev:
2+
bundle exec jekyll serve --drafts --livereload
3+
4+
build:
5+
bundle exec jekyll build
6+
7+
install:
8+
gem install bundler jekyll && bundle update
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Web Scraping Workshop Website Madhacks 2023
2+
3+
This is heavily based upon [CloudCannon/fur-jekyll-template](https://github.com/CloudCannon/fur-jekyll-template), modified for use in this workshop! It does **not** have a database or any backend at all; the main purpose of this website in the workshop is to teach static-site web scraping with Beautiful Soup.
4+
5+
Reviews are taken from [this dataset](https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews).
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
* Fix search to actually work
2+
* Categorical search (category.html) with sliders and such; it would be nice if filtering there also worked
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# ----
2+
# Site
3+
4+
title: Fur
5+
google_analytics_key:
6+
google_maps_javascript_api_key:
7+
8+
# Values for the jekyll-seo-tag gem (https://github.com/jekyll/jekyll-seo-tag)
9+
logo: /siteicon.png
10+
description: Fur t-shirt store
11+
author:
12+
name: fur
13+
14+
twitter: fur # twitter username without the @ symbol
15+
phone: "+1 23456789"
16+
social:
17+
name: Fur Template
18+
links:
19+
- https://github.com/CloudCannon/fur-jekyll-template
20+
21+
# -----
22+
# Build
23+
timezone: Etc/UTC
24+
25+
collections:
26+
staff_members:
27+
output: false
28+
products:
29+
output: true
30+
31+
permalink: pretty
32+
33+
defaults:
34+
-
35+
scope:
36+
path: ""
37+
type: "products"
38+
values:
39+
layout: "product"
40+
-
41+
scope:
42+
path: ""
43+
values:
44+
layout: "page"
45+
-
46+
scope:
47+
path: "index.html"
48+
values:
49+
layout: "default"
50+
51+
jekyll-archives:
52+
enabled:
53+
- categories
54+
55+
plugins:
56+
- jekyll-sitemap
57+
- jekyll-seo-tag
58+
59+
exclude:
60+
- Gemfile
61+
- Gemfile.lock
62+
- README.md
63+
- LICENCE
64+
65+
social_icons:
66+
- Email
67+
- Facebook
68+
- Google Plus
69+
- Instagram
70+
- LinkedIn
71+
- Pinterest
72+
- Tumblr
73+
- Twitter
74+
- YouTube
75+
- RSS
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
- name: Products
2+
link: /
3+
new_window: false
4+
highlight: false
5+
- name: Our Story
6+
link: /about/
7+
new_window: false
8+
highlight: false
9+
- name: Contact
10+
link: /contact/
11+
new_window: false
12+
highlight: false
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="Capa_1" x="0px" y="0px" viewBox="0 0 48.839 48.839" style="enable-background:new 0 0 48.839 48.839;" xml:space="preserve" width="512px" height="512px">
2+
<g>
3+
<path d="M39.041,36.843c2.054,3.234,3.022,4.951,3.022,6.742c0,3.537-2.627,5.252-6.166,5.252 c-1.56,0-2.567-0.002-5.112-1.326c0,0-1.649-1.509-5.508-1.354c-3.895-0.154-5.545,1.373-5.545,1.373 c-2.545,1.323-3.516,1.309-5.074,1.309c-3.539,0-6.168-1.713-6.168-5.252c0-1.791,0.971-3.506,3.024-6.742 c0,0,3.881-6.445,7.244-9.477c2.43-2.188,5.973-2.18,5.973-2.18h1.093v-0.001c0,0,3.698-0.009,5.976,2.181 C35.059,30.51,39.041,36.844,39.041,36.843z M16.631,20.878c3.7,0,6.699-4.674,6.699-10.439S20.331,0,16.631,0 S9.932,4.674,9.932,10.439S12.931,20.878,16.631,20.878z M10.211,30.988c2.727-1.259,3.349-5.723,1.388-9.971 s-5.761-6.672-8.488-5.414s-3.348,5.723-1.388,9.971C3.684,29.822,7.484,32.245,10.211,30.988z M32.206,20.878 c3.7,0,6.7-4.674,6.7-10.439S35.906,0,32.206,0s-6.699,4.674-6.699,10.439C25.507,16.204,28.506,20.878,32.206,20.878z M45.727,15.602c-2.728-1.259-6.527,1.165-8.488,5.414s-1.339,8.713,1.389,9.972c2.728,1.258,6.527-1.166,8.488-5.414 S48.455,16.861,45.727,15.602z" fill="#FFFFFF"/>
4+
</g>
5+
<g>
6+
</g>
7+
<g>
8+
</g>
9+
<g>
10+
</g>
11+
<g>
12+
</g>
13+
<g>
14+
</g>
15+
<g>
16+
</g>
17+
<g>
18+
</g>
19+
<g>
20+
</g>
21+
<g>
22+
</g>
23+
<g>
24+
</g>
25+
<g>
26+
</g>
27+
<g>
28+
</g>
29+
<g>
30+
</g>
31+
<g>
32+
</g>
33+
<g>
34+
</g>
35+
</svg>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<nav>
2+
<a class="nav-toggle" id="open-nav" href="#">&#9776;</a>
3+
{% for link in site.data.navigation %}
4+
{% assign class = "" %}
5+
{% if link.highlight %}
6+
{% assign class = class | append: " highlight" %}
7+
{% endif %}
8+
9+
{% if link.link == page.url %}
10+
{% assign class = class | append: " active" %}
11+
{% endif %}
12+
<a href="{% include relative-src.html src=link.link %}" class="{{ class }}" {% if link.new_window %}target="_blank"{% endif %}>{{ link.name }}</a>
13+
{% endfor %}
14+
<a class="editor-link btn" href="cloudcannon:collections/_data/navigation.yml" class="btn" title="Edit navigation"><strong>&#9998;</strong></a>
15+
<a href="#" class="cart snipcart-checkout"><i class="material-icons">shopping_cart</i></a>
16+
</nav>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<div class="styles">
2+
{% for style in include.product.styles %}
3+
<div class="style" data-item-id="{{ forloop.index }}" {% unless forloop.first %}style="display: none"{% endunless %}>
4+
<a href="{{site.baseurl}}{{ include.product.url }}">
5+
<img src="{{ site.baseurl }}{{ style.image }}">
6+
</a>
7+
</div>
8+
{% endfor %}
9+
10+
<div class="style-picker">
11+
{% for style in include.product.styles %}
12+
<div style="background-color: {{ style.color }}" data-item-id="{{ forloop.index }}"></div>
13+
{% endfor %}
14+
</div>
15+
</div>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{% assign prefix = include.src | slice: 0, 2 %}{% assign protocol = include.src | slice: 0, 4 %}{% unless protocol == 'http' or prefix == "//" %}{{ site.baseurl }}{% endunless %}{{ include.src }}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{% assign colors = "" %}
2+
{% for style in include.product.styles %}
3+
{% assign colors = colors | append: '|' | append: style.name %}
4+
{% endfor %}
5+
{% assign colors = colors | remove_first: '|' %}
6+
<button class="snipcart-add-item"
7+
data-item-name="{{ include.product.name }}"
8+
data-item-id="{{ include.product.sku }}"
9+
data-item-image="{{ include.product.styles.first.image }}"
10+
data-item-description="{{ include.product.description }}"
11+
data-item-custom1-name="Size"
12+
data-item-custom1-options="{{ include.product.sizes | join: '|' }}"
13+
data-item-custom1-value="Medium"
14+
data-item-custom2-name="Color"
15+
data-item-custom2-options="{{ colors }}"
16+
data-item-url="{{ include.product.url }}"
17+
data-item-price="{{ include.product.price }}">
18+
{% if include.text %}{{ include.text }}{% else %}Add to cart{% endif %}
19+
</button>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{% case include.icon %}
2+
{% when "Email" %}
3+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M20 4H4c-1.1 0-1.99.9-1.99 2L2 18c0 1.1.9 2 2 2h16c1.1 0 2-.9 2-2V6c0-1.1-.9-2-2-2zm0 4l-8 5-8-5V6l8 5 8-5v2z"/></svg>
4+
{% when "Facebook" %}
5+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M19,4V7H17A1,1 0 0,0 16,8V10H19V13H16V20H13V13H11V10H13V7.5C13,5.56 14.57,4 16.5,4M20,2H4A2,2 0 0,0 2,4V20A2,2 0 0,0 4,22H20A2,2 0 0,0 22,20V4C22,2.89 21.1,2 20,2Z" /></svg>
6+
{% when "Facebook2" %}
7+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 155.139 155.139"><path d="M89.584 155.14V84.377h23.742l3.562-27.585H89.584v-17.61c0-7.983 2.208-13.424 13.67-13.424l14.595-.007V1.08C115.324.752 106.66 0 96.576 0 75.52 0 61.104 12.853 61.104 36.452v20.34H37.29V84.38h23.814v70.76h28.48z" fill="#010002"/></svg>
8+
{% when "Instagram" %}
9+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M7.8,2H16.2C19.4,2 22,4.6 22,7.8V16.2A5.8,5.8 0 0,1 16.2,22H7.8C4.6,22 2,19.4 2,16.2V7.8A5.8,5.8 0 0,1 7.8,2M7.6,4A3.6,3.6 0 0,0 4,7.6V16.4C4,18.39 5.61,20 7.6,20H16.4A3.6,3.6 0 0,0 20,16.4V7.6C20,5.61 18.39,4 16.4,4H7.6M17.25,5.5A1.25,1.25 0 0,1 18.5,6.75A1.25,1.25 0 0,1 17.25,8A1.25,1.25 0 0,1 16,6.75A1.25,1.25 0 0,1 17.25,5.5M12,7A5,5 0 0,1 17,12A5,5 0 0,1 12,17A5,5 0 0,1 7,12A5,5 0 0,1 12,7M12,9A3,3 0 0,0 9,12A3,3 0 0,0 12,15A3,3 0 0,0 15,12A3,3 0 0,0 12,9Z" /></svg>
10+
{% when "LinkedIn" %}
11+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M19,19H16V13.7A1.5,1.5 0 0,0 14.5,12.2A1.5,1.5 0 0,0 13,13.7V19H10V10H13V11.2C13.5,10.36 14.59,9.8 15.5,9.8A3.5,3.5 0 0,1 19,13.3M6.5,8.31C5.5,8.31 4.69,7.5 4.69,6.5A1.81,1.81 0 0,1 6.5,4.69C7.5,4.69 8.31,5.5 8.31,6.5A1.81,1.81 0 0,1 6.5,8.31M8,19H5V10H8M20,2H4C2.89,2 2,2.89 2,4V20A2,2 0 0,0 4,22H20A2,2 0 0,0 22,20V4C22,2.89 21.1,2 20,2Z" /></svg>
12+
{% when "Pinterest" %}
13+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M13,16.2C12.2,16.2 11.43,15.86 10.88,15.28L9.93,18.5L9.86,18.69L9.83,18.67C9.64,19 9.29,19.2 8.9,19.2C8.29,19.2 7.8,18.71 7.8,18.1C7.8,18.05 7.81,18 7.81,17.95H7.8L7.85,17.77L9.7,12.21C9.7,12.21 9.5,11.59 9.5,10.73C9.5,9 10.42,8.5 11.16,8.5C11.91,8.5 12.58,8.76 12.58,9.81C12.58,11.15 11.69,11.84 11.69,12.81C11.69,13.55 12.29,14.16 13.03,14.16C15.37,14.16 16.2,12.4 16.2,10.75C16.2,8.57 14.32,6.8 12,6.8C9.68,6.8 7.8,8.57 7.8,10.75C7.8,11.42 8,12.09 8.34,12.68C8.43,12.84 8.5,13 8.5,13.2A1,1 0 0,1 7.5,14.2C7.13,14.2 6.79,14 6.62,13.7C6.08,12.81 5.8,11.79 5.8,10.75C5.8,7.47 8.58,4.8 12,4.8C15.42,4.8 18.2,7.47 18.2,10.75C18.2,13.37 16.57,16.2 13,16.2M20,2H4C2.89,2 2,2.89 2,4V20A2,2 0 0,0 4,22H20A2,2 0 0,0 22,20V4C22,2.89 21.1,2 20,2Z" /></svg>
14+
{% when "Tumblr" %}
15+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M16,11H13V14.9C13,15.63 13.14,16 14.1,16H16V19C16,19 14.97,19.1 13.9,19.1C11.25,19.1 10,17.5 10,15.7V11H8V8.2C10.41,8 10.62,6.16 10.8,5H13V8H16M20,2H4C2.89,2 2,2.89 2,4V20A2,2 0 0,0 4,22H20A2,2 0 0,0 22,20V4C22,2.89 21.1,2 20,2Z" /></svg>
16+
{% when "Twitter" %}
17+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M22.46,6C21.69,6.35 20.86,6.58 20,6.69C20.88,6.16 21.56,5.32 21.88,4.31C21.05,4.81 20.13,5.16 19.16,5.36C18.37,4.5 17.26,4 16,4C13.65,4 11.73,5.92 11.73,8.29C11.73,8.63 11.77,8.96 11.84,9.27C8.28,9.09 5.11,7.38 3,4.79C2.63,5.42 2.42,6.16 2.42,6.94C2.42,8.43 3.17,9.75 4.33,10.5C3.62,10.5 2.96,10.3 2.38,10C2.38,10 2.38,10 2.38,10.03C2.38,12.11 3.86,13.85 5.82,14.24C5.46,14.34 5.08,14.39 4.69,14.39C4.42,14.39 4.15,14.36 3.89,14.31C4.43,16 6,17.26 7.89,17.29C6.43,18.45 4.58,19.13 2.56,19.13C2.22,19.13 1.88,19.11 1.54,19.07C3.44,20.29 5.7,21 8.12,21C16,21 20.33,14.46 20.33,8.79C20.33,8.6 20.33,8.42 20.32,8.23C21.16,7.63 21.88,6.87 22.46,6Z" /></svg>
18+
{% when "YouTube" %}
19+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M10,16.5V7.5L16,12M20,4.4C19.4,4.2 15.7,4 12,4C8.3,4 4.6,4.19 4,4.38C2.44,4.9 2,8.4 2,12C2,15.59 2.44,19.1 4,19.61C4.6,19.81 8.3,20 12,20C15.7,20 19.4,19.81 20,19.61C21.56,19.1 22,15.59 22,12C22,8.4 21.56,4.91 20,4.4Z" /></svg>
20+
{% when "RSS" %}
21+
<svg fill="#000000" height="24" viewBox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M0 0h24v24H0z" fill="none"/><circle cx="6.18" cy="17.82" r="2.18"/><path d="M4 4.44v2.83c7.03 0 12.73 5.7 12.73 12.73h2.83c0-8.59-6.97-15.56-15.56-15.56zm0 5.66v2.83c3.9 0 7.07 3.17 7.07 7.07h2.83c0-5.47-4.43-9.9-9.9-9.9z"/></svg>
22+
{% endcase %}

0 commit comments

Comments
 (0)