Creating a Web Scraper Using Scrapy
Setting Up Our Environment
Let’s start by updating our package lists and installing the necessary tools:
apt update
Now let’s install Python 3, Python virtual environment support, and pip:
apt install -y python3 python3.10-venv python3-pip
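Since we just installed virtual environment support, it’s worth isolating the project’s dependencies before pulling in Scrapy. A minimal sketch, assuming a hypothetical environment directory of ~/scraper-venv:
python3 -m venv ~/scraper-venv    # directory name is illustrative
source ~/scraper-venv/bin/activate
With the environment active, the pip and scrapy commands that follow operate inside it rather than system-wide.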
Next, let’s install Scrapy, the Python framework we’ll use for web scraping:
pip install scrapy==2.12.0
Let’s create a new Scrapy project:
scrapy startproject webscraper
cd webscraper
To better visualize our project structure, let’s install the tree utility:
apt install -y tree
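Running tree from the project root should show a layout roughly like the one below (the exact files can vary slightly between Scrapy versions):
tree .
.
├── scrapy.cfg
└── webscraper
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        └── __init__.py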
Scrapy requires spiders to be defined as Python classes. Let’s create a file named altoro_spider.py in the spiders directory:
cat > webscraper/spiders/altoro_spider.py <<EOF
import scrapy


class AltoroSpider(scrapy.Spider):
    name = "altoro"
    allowed_domains = ['demo.testfire.net']
    start_urls = ['https://demo.testfire.net/login.jsp']
EOF
Next, we’ll add the main parsing method. It extracts data from the initial page, records the site structure as an item, and follows the navigation links to the content pages:
cat >> webscraper/spiders/altoro_spider.py <<EOF

    def parse(self, response):
        # Extract main navigation links
        nav_links = response.css('a::attr(href)').getall()
        # Extract main sections
        sections = {
            'personal': response.xpath('//a[contains(@href, "personal")]'),
            'business': response.xpath('//a[contains(@href, "business")]'),
            'inside': response.xpath('//a[contains(@href, "inside")]')
        }
        # Record the page structure so the analysis script can use it later
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
            'links': nav_links,
            'main_sections': {name: self.extract_section_links(sel)
                              for name, sel in sections.items()}
        }
        # Follow the navigation links; requests outside allowed_domains are filtered out
        yield from response.follow_all(nav_links, callback=self.parse_content_page)
EOF
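If you’d like to experiment with these CSS and XPath selectors before wiring them into the spider, Scrapy’s interactive shell is a convenient sandbox; the URL is simply the spider’s start page:
scrapy shell 'https://demo.testfire.net/login.jsp'
Inside the shell, expressions such as response.css('a::attr(href)').getall() or the XPath lookups above can be evaluated directly against the live response.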
Next, we’ll add the method that handles the content pages:
cat >> webscraper/spiders/altoro_spider.py <<EOF

    def parse_content_page(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
            'content': response.css('div.content::text').getall(),
            'links': response.css('a::attr(href)').getall()
        }
EOF
Finally, we’ll add a helper method to process section links:
cat >> webscraper/spiders/altoro_spider.py <<EOF

    def extract_section_links(self, section):
        return [{'text': link.css('::text').get(),
                 'url': link.css('::attr(href)').get()}
                for link in section]
EOF
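At this point the spider file is complete. A quick way to catch any indentation or syntax slips introduced by the heredocs is to byte-compile the file and ask Scrapy to list the spiders it can load:
python3 -m py_compile webscraper/spiders/altoro_spider.py
scrapy list
If everything is in order, scrapy list should print altoro.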
Now let’s replace the generated settings file to configure our scraper:
cat > webscraper/settings.py <<EOF
# Scrapy settings for webscraper project
BOT_NAME = "altoro_scraper"
SPIDER_MODULES = ["webscraper.spiders"]
NEWSPIDER_MODULE = "webscraper.spiders"
# Respect the site's robots.txt
ROBOTSTXT_OBEY = True
# Add significant delays between requests to avoid overloading the server
DOWNLOAD_DELAY = 3
# Identify your scraper (always use a descriptive user agent)
USER_AGENT = "AltoroScraperBot (+https://yourwebsite.com)"
# Don't overload the demo site with concurrent requests
CONCURRENT_REQUESTS = 1
# Cache responses to reduce server load
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 86400
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
EOF
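As a sanity check, Scrapy can print any of these values back when run from the project directory, confirming the file is being picked up:
scrapy settings --get DOWNLOAD_DELAY
scrapy settings --get USER_AGENT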
Now we can run our web scraper to collect data:
cd ~/webscraper
scrapy crawl altoro -O altoro_structure.json
Once the crawl is complete, you can pretty-print the scraped data with jq (install it first with apt install -y jq if it isn’t available):
jq . altoro_structure.json
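A couple of optional jq queries give a quick overview of what was collected; the field names here are the ones yielded by the spider above:
jq 'length' altoro_structure.json
jq '[.[].url]' altoro_structure.json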
Let’s create our analysis script:
cat > analyze_scraped_data.py <<EOF
import json
from collections import Counter

# Load the scraped data
with open('altoro_structure.json', 'r') as f:
    data = json.load(f)
EOF
Add code to count unique URLs:
cat >> analyze_scraped_data.py <<EOF
# Count all unique URLs
all_urls = []
for item in data:
    # Add the main URL of each page
    if 'url' in item:
        all_urls.append(item['url'])
    # Add all links found on the page
    if 'links' in item:
        all_urls.extend(item['links'])

# Use a set to remove duplicates
unique_urls = set(all_urls)
print(f"Total unique URLs found: {len(unique_urls)}")
EOF
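One caveat: the href values scraped from the pages are mostly relative paths, so the count above mixes relative and absolute forms. If you want unique absolute URLs instead, a small standalone variation (a sketch, not part of the script we’re building) could normalize them with the standard library’s urljoin:
import json
from urllib.parse import urljoin

with open('altoro_structure.json') as f:
    data = json.load(f)

# Resolve every collected href against the page it was found on
absolute_urls = set()
for item in data:
    page_url = item.get('url', '')
    for href in item.get('links', []):
        absolute_urls.add(urljoin(page_url, href))
print(f"Unique absolute URLs: {len(absolute_urls)}")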
Now add code to categorize the content:
cat >> analyze_scraped_data.py <<EOF
# Analyze content by page title
content_types = Counter()
for item in data:
    if 'title' in item and isinstance(item.get('title'), str):
        title = item['title'].lower()
        # Categorize based on keywords in titles
        if 'login' in title:
            content_types['login'] += 1
        elif 'personal' in title:
            content_types['personal'] += 1
        elif 'business' in title:
            content_types['business'] += 1
        elif 'about' in title or 'inside' in title:
            content_types['about'] += 1
        else:
            content_types['other'] += 1

print("\nContent type distribution:")
for content_type, count in content_types.items():
    print(f"- {content_type}: {count}")
EOF
Next, add code to extract financial terminology, starting with terms that appear in the URLs:
cat >> analyze_scraped_data.py <<EOF
# Define banking/financial terms to look for
financial_terms = set()
banking_terms = ['account', 'banking', 'credit', 'loan', 'mortgage',
                 'investment', 'transfer', 'deposit', 'withdrawal',
                 'checking', 'savings', 'retirement', 'insurance',
                 'lending', 'cards', 'financial', 'business', 'personal']

# Search URLs for terms
for item in data:
    if 'url' in item and isinstance(item.get('url'), str):
        url_text = item['url'].lower()
        for term in banking_terms:
            if term in url_text:
                financial_terms.add(term)
EOF
Extract Terms from Titles
The next few snippets continue the body of the for item in data: loop above, so they stay indented one level:
cat >> analyze_scraped_data.py <<EOF
    # Search page titles for terms
    if 'title' in item and isinstance(item.get('title'), str):
        title_text = item['title'].lower()
        for term in banking_terms:
            if term in title_text:
                financial_terms.add(term)
EOF
Extract Terms from Content
cat >> analyze_scraped_data.py <<EOF
    # Search content (if available) for terms
    if 'content' in item and isinstance(item.get('content'), list):
        for content in item['content']:
            if isinstance(content, str):
                content_text = content.lower()
                for term in banking_terms:
                    if term in content_text:
                        financial_terms.add(term)
EOF
Extract Terms from Navigation Sections
cat >> analyze_scraped_data.py <<EOF
    # Search navigation sections
    if 'main_sections' in item and isinstance(item.get('main_sections'), dict):
        # Check section names (like "personal", "business")
        for section_name, links in item['main_sections'].items():
            section_text = section_name.lower()
            for term in banking_terms:
                if term in section_text:
                    financial_terms.add(term)

            # Check links within sections
            if isinstance(links, list):
                for link in links:
                    if isinstance(link, dict):
                        # Check link text
                        if 'text' in link and isinstance(link['text'], str):
                            link_text = link['text'].lower()
                            for term in banking_terms:
                                if term in link_text:
                                    financial_terms.add(term)
EOF
Print the Results
cat >> analyze_scraped_data.py <<EOF

# Display all financial terms found
print("\nFinancial terms found:")
if financial_terms:
    for term in sorted(financial_terms):
        print(f"- {term}")
else:
    print("No financial terms found.")
EOF
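With the script assembled, run it from the project directory (this assumes the altoro_structure.json file produced by the crawl above sits alongside it):
python3 analyze_scraped_data.py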