
Email Id Extractor Project from sites in Scrapy Python

Python:
# web scraping framework
import scrapy

# for regular expressions
import re

# for selenium request
from scrapy_selenium import SeleniumRequest

# for link extraction
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

class EmailtrackSpider(scrapy.Spider):
    # name of spider
    name = 'emailtrack'

    # set that collects unique email ids across pages
    uniqueemail = set()

    
    # the first request is sent to google and
    # the parse function is called on the response
    def start_requests(self):
        yield SeleniumRequest(
            url="https://www.google.com",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        # extract every link from the page source
        links = LxmlLinkExtractor(allow=()).extract_links(response)

        # Finallinks holds the url of each extracted link
        Finallinks = [str(link.url) for link in links]

        # links list for urls that may contain email ids
        links = []

        # keep only the urls worth visiting: about and contact
        # pages are the ones most likely to carry email ids
        for link in Finallinks:
            if 'contact' in link.lower() or 'about' in link.lower():
                links.append(link)

        # the current page url is also added, since a few sites
        # keep email ids on their main page
        links.append(str(response.url))

        # parse_link is called on the first url to extract email ids
        l = links.pop(0)

        # meta carries the remaining links list over to parse_link
        yield SeleniumRequest(
            url=l,
            wait_time=3,
            screenshot=True,
            callback=self.parse_link,
            dont_filter=True,
            meta={'links': links}
        )


    def parse_link(self, response):

        # recover the remaining links list passed via meta
        links = response.meta['links']

        # urls containing these bad words (social/reference sites)
        # are discarded, since they rarely expose plain email ids
        bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin']

        # extract email ids only if no bad word appears in the url
        if not any(word in str(response.url) for word in bad_words):
            html_text = str(response.text)

            # raw-string regex matching email-like patterns
            email_list = re.findall(r'\w+@\w+\.\w+', html_text)

            # add matches to the spider-wide set of unique email ids
            self.uniqueemail.update(email_list)

        # visit the remaining urls one at a time;
        # once the list is empty, move on to the parsed callback
        if len(links) > 0:
            l = links.pop(0)
            yield SeleniumRequest(
                url=l,
                callback=self.parse_link,
                dont_filter=True,
                meta={'links': links}
            )
        else:
            yield SeleniumRequest(
                url=response.url,
                callback=self.parsed,
                dont_filter=True
            )

    def parsed(self, response):
        # turn the uniqueemail set into a list
        emails = list(self.uniqueemail)
        finalemail = []

        for email in emails:
            # keep only addresses containing common tld-like substrings,
            # which filters out garbage values matched by the regex
            if any(part in email for part in ('.in', '.com', 'info', 'org')):
                finalemail.append(email)

        # final unique email ids scraped from the crawled site
        print('\n' * 2)
        print("Emails scraped", finalemail)
        print('\n' * 2)
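
To actually run this, scrapy-selenium has to be wired into the project's settings.py. A minimal sketch, assuming a Chrome driver (the executable path below is a placeholder, adjust it for your own setup):

Python:
# settings.py -- minimal scrapy-selenium wiring (sketch;
# driver name and path are placeholders for your setup)
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/local/bin/chromedriver'
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

# enable the selenium downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

Then start the crawl from the project directory with scrapy crawl emailtrack. Note the spider starts from https://www.google.com; to harvest a specific site, point the url in start_requests at that site instead.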
 

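The email regex is deliberately loose, which is why the parsed step filters the results afterwards. A quick standalone check of what the pattern matches (the sample strings are made up for illustration):

Python:
import re

# made-up sample text for illustration
sample = 'reach us at admin@example.com or support@site.org, not at @@junk'
print(re.findall(r'\w+@\w+\.\w+', sample))
# ['admin@example.com', 'support@site.org']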
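If you want the results saved rather than just printed, the parsed callback could yield items instead, letting Scrapy's feed exports handle persistence. A sketch, assuming the crawl is run with -o emails.json:

Python:
# sketch: replace the print calls in parsed with yielded items,
# then run `scrapy crawl emailtrack -o emails.json`
def parsed(self, response):
    for email in self.uniqueemail:
        # same garbage filter as in the original parsed
        if any(part in email for part in ('.in', '.com', 'info', 'org')):
            yield {'email': email}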