Python:
# web scraping framework
import scrapy

# for regular expressions
import re

# for Selenium-rendered requests
from scrapy_selenium import SeleniumRequest

# for link extraction
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
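
# NOTE: scrapy-selenium must be configured in the project's settings.py
# for SeleniumRequest to work; a sketch of the required entries is
# given after the spider code.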

class EmailtrackSpider(scrapy.Spider):
    # name of the spider
    name = 'emailtrack'

    # set that collects the unique email ids found during the crawl
    uniqueemail = set()

    # the first request is sent from here; its response
    # is handled by the parse() callback
    def start_requests(self):
        yield SeleniumRequest(
            url="https://www.google.com",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        # extract every link from the rendered page source
        links = LxmlLinkExtractor(allow=()).extract_links(response)

        # Finallinks holds the URLs of the extracted links
        Finallinks = [str(link.url) for link in links]

        # links list for URLs that may contain email ids
        links = []

        # keep only the needed URLs: "about" and "contact"
        # pages are the ones that usually carry email ids
        for link in Finallinks:
            if 'contact' in link.lower() or 'about' in link.lower():
                links.append(link)

        # the current page URL is added as well, because a few
        # sites list email ids on their main page
        links.append(str(response.url))

        # take the first URL and pass the rest through meta,
        # so parse_link() can extract email ids page by page
        l = links.pop(0)

        yield SeleniumRequest(
            url=l,
            wait_time=3,
            screenshot=True,
            callback=self.parse_link,
            dont_filter=True,
            meta={'links': links}
        )

    def parse_link(self, response):
        # response.meta['links'] carries the list of remaining links
        links = response.meta['links']
        flag = 0

        # pages whose URL contains one of these words are discarded
        bad_words = ['facebook', 'instagram', 'youtube',
                     'twitter', 'wiki', 'linkedin']

        for word in bad_words:
            # if a bad word appears in the current page URL,
            # set the flag and stop checking
            if word in str(response.url):
                flag = 1
                break

        # if the flag is set, skip email extraction for this page
        if flag != 1:
            html_text = str(response.text)

            # regular expression that matches email ids
            # (a raw string avoids invalid escape warnings)
            email_list = re.findall(r'\w+@\w+\.\w+', html_text)

            # convert to a set to drop duplicates from this page,
            # then add the email ids to the final uniqueemail set
            for email in set(email_list):
                self.uniqueemail.add(email)

        # parse_link() keeps requesting the next link while any
        # remain; once the list is empty, control moves to parsed()
        if len(links) > 0:
            l = links.pop(0)
            yield SeleniumRequest(
                url=l,
                callback=self.parse_link,
                dont_filter=True,
                meta={'links': links}
            )
        else:
            yield SeleniumRequest(
                url=response.url,
                callback=self.parsed,
                dont_filter=True
            )

    def parsed(self, response):
        # copy the uniqueemail set into a list
        emails = list(self.uniqueemail)
        finalemail = []

        for email in emails:
            # filter out garbage matches: keep only addresses that
            # contain '.in', '.com', 'info' or 'org'
            if ('.in' in email or '.com' in email
                    or 'info' in email or 'org' in email):
                finalemail.append(email)

        # final list of unique email ids found by the crawl
        print('\n' * 2)
        print("Emails scraped", finalemail)
        print('\n' * 2)
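
For this spider to run, scrapy-selenium has to be wired into the Scrapy project's settings.py. The entries below are a minimal sketch following the scrapy-selenium README, assuming Firefox with a locally installed geckodriver; swap in your own driver name and path as needed:

Python:
# settings.py - minimal scrapy-selenium wiring (a sketch, not part of the spider above)
from shutil import which

SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_DRIVER_ARGUMENTS = ['-headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

With these settings in place, the spider is started from the project root with scrapy crawl emailtrack. Note that the pattern r'\w+@\w+\.\w+' is deliberately loose and will also match strings such as image file names (hence the '.in'/'.com' filter in parsed()); a stricter pattern is one possible refinement, left out here to keep the tutorial's behavior unchanged.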