Creating a Web Scraper Using Scrapy

Setting Up Our Environment

Let’s start by updating our package lists and installing the necessary tools:

apt update

Now let’s install Python3, Python virtual environment support, and pip:

apt install -y python3 python3.10-venv python3-pip
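
Since we just installed virtual environment support, you can optionally create and activate a virtual environment so the Python packages we install next stay isolated from the system Python (the ~/scraper-venv path below is just an example):

python3 -m venv ~/scraper-venv
source ~/scraper-venv/bin/activate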

Next, let’s install Scrapy, the Python framework we’ll use for web scraping:

pip install scrapy==2.12.0
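
You can quickly confirm the installation by asking Scrapy for its version:

scrapy version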

Let’s create a new Scrapy project:

scrapy startproject webscraper
cd webscraper

To better visualize our project structure, let’s install the tree utility:

apt install -y tree
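
Running tree from inside the project directory shows the files that scrapy startproject generated. The exact listing can vary slightly between Scrapy versions, but it should look roughly like this:

tree .

.
├── scrapy.cfg
└── webscraper
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        └── __init__.py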

Scrapy requires spiders to be defined as Python classes. Let’s create a file named altoro_spider.py in the spiders directory:

cat > webscraper/spiders/altoro_spider.py <<EOF
import scrapy

class AltoroSpider(scrapy.Spider):
    name = "altoro"
    allowed_domains = ['demo.testfire.net']
    start_urls = ['https://demo.testfire.net/login.jsp']
EOF

Next, we’ll add the main parsing method. It extracts the navigation links and main sections from the initial page, yields the site structure, and follows each link so the content pages can be scraped as well:

cat >> webscraper/spiders/altoro_spider.py <<EOF
    def parse(self, response):
        # Extract main navigation links
        nav_links = response.css('a::attr(href)').getall()

        # Extract main sections
        sections = {
            'personal': response.xpath('//a[contains(@href, "personal")]'),
            'business': response.xpath('//a[contains(@href, "business")]'),
            'inside': response.xpath('//a[contains(@href, "inside")]')
        }

        # Yield the structure of the start page
        yield {
            'url': response.url,
            'links': nav_links,
            'main_sections': {name: self.extract_section_links(links) for name, links in sections.items()}
        }

        # Follow each navigation link; Scrapy deduplicates repeated URLs
        for href in nav_links:
            yield response.follow(href, callback=self.parse_content_page)
EOF

Next, we’ll add the method that handles the content pages:

cat >> webscraper/spiders/altoro_spider.py <<EOF

    def parse_content_page(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
            'content': response.css('div.content::text').getall(),
            'links': response.css('a::attr(href)').getall()
        }
EOF

Finally, we’ll add a helper method to process section links:

cat >> webscraper/spiders/altoro_spider.py <<EOF

    def extract_section_links(self, section):
        return [{'text': link.css('::text').get(), 
                 'url': link.css('::attr(href)').get()} 
                for link in section]
EOF
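
The spider is now complete. As a quick sanity check, you can ask Scrapy (from the project root) to list the spiders it can find; altoro should appear in the output:

scrapy list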

Now let’s replace the generated settings file with a configuration tuned for our scraper:

cat > webscraper/settings.py <<EOF
# Scrapy settings for webscraper project

BOT_NAME = "altoro_scraper"

SPIDER_MODULES = ["webscraper.spiders"]
NEWSPIDER_MODULE = "webscraper.spiders"

# Respect the site's robots.txt
ROBOTSTXT_OBEY = True

# Add significant delays between requests to avoid overloading the server
DOWNLOAD_DELAY = 3

# Identify your scraper (always use a descriptive user agent)
USER_AGENT = "AltoroScraperBot (+https://yourwebsite.com)"

# Don't overload the demo site with concurrent requests
CONCURRENT_REQUESTS = 1

# Cache responses to reduce server load
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 86400

# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
EOF
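
To confirm that Scrapy picks up the new configuration, you can query an individual setting from inside the project directory; this should print altoro_scraper:

scrapy settings --get BOT_NAME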

Now we can run our web scraper to collect data:

cd ~/webscraper
scrapy crawl altoro -O altoro_structure.json

Once the crawl is complete, you can pretty-print the scraped data with jq (install it first with apt install -y jq if it isn’t already available):

jq . altoro_structure.json
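
jq also makes it easy to pull out individual fields. For example, to list just the URLs that were scraped:

jq '.[].url' altoro_structure.json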

Let’s create our analysis script:

cat > analyze_scraped_data.py <<EOF
import json
from collections import Counter

# Load the scraped data
with open('altoro_structure.json', 'r') as f:
    data = json.load(f)
EOF

Add code to count unique URLs:

cat >> analyze_scraped_data.py <<EOF

# Count all unique URLs
all_urls = []
for item in data:
    # Add the main URL of each page
    if 'url' in item:
        all_urls.append(item['url'])
    # Add all links found on the page
    if 'links' in item:
        all_urls.extend(item['links'])

# Use a set to remove duplicates
unique_urls = set(all_urls)
print(f"Total unique URLs found: {len(unique_urls)}")
EOF

Now add code to categorize the content:

cat >> analyze_scraped_data.py <<EOF

# Analyze content by page title
content_types = Counter()
for item in data:
    if 'title' in item and isinstance(item.get('title'), str):
        title = item['title'].lower()
        # Categorize based on keywords in titles
        if 'login' in title:
            content_types['login'] += 1
        elif 'personal' in title:
            content_types['personal'] += 1
        elif 'business' in title:
            content_types['business'] += 1
        elif 'about' in title or 'inside' in title:
            content_types['about'] += 1
        else:
            content_types['other'] += 1

print("\nContent type distribution:")
for content_type, count in content_types.items():
    print(f"- {content_type}: {count}")
EOF

Next, add code to extract financial terminology, starting with the terms that appear in URLs:

cat >> analyze_scraped_data.py <<EOF

# Define banking/financial terms to look for
financial_terms = set()
banking_terms = ['account', 'banking', 'credit', 'loan', 'mortgage', 
                'investment', 'transfer', 'deposit', 'withdrawal',
                'checking', 'savings', 'retirement', 'insurance',
                'lending', 'cards', 'financial', 'business', 'personal']

# Search URLs for terms
for item in data:
    if 'url' in item and isinstance(item.get('url'), str):
        url_text = item['url'].lower()
        for term in banking_terms:
            if term in url_text:
                financial_terms.add(term)
EOF

Now append the code that searches page titles for the same terms. Note that this snippet (and the ones that follow) is indented so that it continues the for item in data: loop started above:

cat >> analyze_scraped_data.py <<EOF

    # Search page titles for terms
    if 'title' in item and isinstance(item.get('title'), str):
        title_text = item['title'].lower()
        for term in banking_terms:
            if term in title_text:
                financial_terms.add(term)
EOF

Then append the code that searches the scraped page content:

cat >> analyze_scraped_data.py <<EOF

    # Search content (if available) for terms
    if 'content' in item and isinstance(item.get('content'), list):
        for content in item['content']:
            if isinstance(content, str):
                content_text = content.lower()
                for term in banking_terms:
                    if term in content_text:
                        financial_terms.add(term)
EOF

Next, append the code that searches the navigation sections yielded by the spider:

cat >> analyze_scraped_data.py <<EOF

    # Search navigation sections
    if 'main_sections' in item and isinstance(item.get('main_sections'), dict):
        # Check section names (like "personal", "business")
        for section_name, links in item['main_sections'].items():
            section_text = section_name.lower()
            for term in banking_terms:
                if term in section_text:
                    financial_terms.add(term)

            # Check links within sections
            if isinstance(links, list):
                for link in links:
                    if isinstance(link, dict):
                        # Check link text
                        if 'text' in link and isinstance(link['text'], str):
                            link_text = link['text'].lower()
                            for term in banking_terms:
                                if term in link_text:
                                    financial_terms.add(term)
EOF

Finally, print the results:

cat >> analyze_scraped_data.py <<EOF

# Display all financial terms found
print("\nFinancial terms found:")
if financial_terms:
    for term in sorted(financial_terms):
        print(f"- {term}")
else:
    print("No financial terms found.")
EOF
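
With the script assembled, run it against the scraped data:

python3 analyze_scraped_data.py

It prints the number of unique URLs, the content type distribution, and any financial terms it found.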