import re
import pandas as pd
import urllib.parse
import xlsxwriter
import urllib.request
import time
import dns.resolver
import socket
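# This script reads a list of business websites from test.xlsx, scrapes each
# homepage (and its /contact page) for email addresses, validates them with a
# series of syntax and DNS checks, builds personalised WhatsApp links from the
# message.txt template, and writes the results to WhatsApp.xlsx and Emails.xlsx.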
# Regex for extracting candidate email addresses from page text,
# e.g. something-x.01+tag@some-domain.com
emailRegex = re.compile(r'''
    [a-zA-Z0-9._%+-]+      # local part
    @
    [a-zA-Z0-9.-]+         # domain
    \.[a-zA-Z]{2,}         # top-level domain
''', re.VERBOSE)
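# Illustrative example (hypothetical address):
# emailRegex.findall("Reach us at info@example.com") should yield ['info@example.com']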
# Email validation function
def is_valid_email(email):
# Syntax validation
if not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
return False
# Check if the email contains more than 8 digits
if len(re.findall(r'\d', email)) > 8:
return False
# Extract domain from email
domain = email.split('@')[1]
# Check against popular email providers
popular_providers = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'aol.com']
if domain.lower() in popular_providers:
return True
image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp']
for ext in image_extensions:
if email.lower().endswith(ext):
return False
try:
# DNS MX record lookup
dns.resolver.resolve(domain, 'MX')
return True
except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
pass
except Exception as e:
print(f"DNS resolver error: {e}")
pass
    try:
        # SPF record check: look for a TXT record containing "v=spf1"
        txt_records = dns.resolver.resolve(domain, 'TXT')
        for record in txt_records:
            if 'v=spf1' in record.to_text().lower():
                return True
        print("SPF record not found")
    except Exception as e:
        print(f"SPF record error: {e}")
    try:
        # DKIM record check (best effort: DKIM records normally live at
        # <selector>._domainkey.<domain>, so this bare lookup may fail for valid domains)
        dns.resolver.resolve(f'_domainkey.{domain}', 'TXT')
        return True
    except Exception as e:
        print(f"DKIM record error: {e}")
    try:
        # Last resort: check that the domain at least resolves for SMTP (port 25)
        socket.getaddrinfo(domain, 25, socket.AF_INET, socket.SOCK_STREAM)
        return True
    except Exception as e:
        print(f"SMTP reachability error: {e}")
    print("No hope for this email, it can't be checked")
return False
# Dictionary to store emails by website
emails_by_website = {}
# Extracting emails
def extractEmailsFromUrlText(urlText, website):
    extractedEmails = emailRegex.findall(urlText)
    for email in extractedEmails:
        email = email.lower()
        print("--------------------------------")
        print(email)
        if is_valid_email(email):  # Validate the email address
            if website in emails_by_website:
                emails_by_website[website].add(email)  # Add the email to the set
            else:
                emails_by_website[website] = {email}  # Create a new set for the website
# HtmlPage Read Func
def htmlPageRead(url, i):
print(url)
# Ensure URL has a scheme (http or https)
if not url.startswith(('http://', 'https://')):
url = 'http://' + url # Assuming http if no scheme is provided
try:
start = time.time()
headers = {'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(request)
urlHtmlPageRead = response.read()
        urlText = urlHtmlPageRead.decode('utf-8', errors='replace')  # Decode as UTF-8, replacing undecodable bytes
print("%s.%s\tFetched in : %s" % (i, url, (time.time() - start)))
return urlText
except Exception as e: # Catch the specific exception and print it
print(f"Error fetching {url}: {e}")
return None
# EmailsLeechFunction
def emailsLeechFunc(url, i):
mainUrlText = htmlPageRead(url, i)
print(mainUrlText)
if mainUrlText:
extractEmailsFromUrlText(mainUrlText, url) # Extract emails from main URL
    # Also check the /contact page for additional addresses
    contactPageUrl = url.rstrip('/') + '/contact'
    contactPageText = htmlPageRead(contactPageUrl, i)
    if contactPageText:
        extractEmailsFromUrlText(contactPageText, url)  # Store /contact emails under the main website URL
# Read URLs from Excel file
start = time.time()
df = pd.read_excel("test.xlsx")
i = 0
# Function to read message from file
def read_message_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
message = file.read().strip()
return message
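# message.txt is expected to contain the outreach text with a {{name}}
# placeholder, which is replaced with each business name below.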
# Scrape every website listed in the spreadsheet
for index, row in df.iterrows():
i += 1
url = row['website'] # Assuming 'website' column contains the URLs
if pd.notnull(url):
emailsLeechFunc(url, i)
# Keep only the name, address, website, and phone_number columns from the spreadsheet
df = df[['name', 'address', 'website', 'phone_number']]
# Function to create a WhatsApp link with a pre-filled, URL-encoded message
def generate_whatsapp_link(phone_number, message):
    return 'https://web.whatsapp.com/send?phone={}&text={}'.format(phone_number, urllib.parse.quote(message))
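# Example (hypothetical number):
# generate_whatsapp_link('15551234567', 'Hello Acme 1')
# -> 'https://web.whatsapp.com/send?phone=15551234567&text=Hello%20Acme%201'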
# Read message from file
message = read_message_from_file('message.txt')
# Iterate over rows and replace '{{name}}' with the corresponding company name
# (the full message is URL-encoded later, when the WhatsApp link is built)
df['Message'] = ''
for index, row in df.iterrows():
    company_name = str(row['name'])  # Convert to string
    if not company_name or company_name == 'nan':  # Guard against missing names
        company_name = ''
    df.at[index, 'Message'] = message.replace('{{name}}', company_name)
# Add WhatsApp Link column
df['WhatsApp Link'] = df.apply(lambda row: generate_whatsapp_link(row['phone_number'], row['Message']), axis=1)
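# At this point df has: name, address, website, phone_number, Message, WhatsApp Link
# (an Email column is added below before the sheet is written)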
# Add an Email column with the scraped addresses for each website
for index, row in df.iterrows():
url = row['website']
if url in emails_by_website:
df.at[index, 'Email'] = ', '.join(emails_by_website[url])
else:
df.at[index, 'Email'] = ''
# Save DataFrame to Excel with clickable links and expanded columns
excel_file = 'WhatsApp.xlsx'
writer = pd.ExcelWriter(excel_file, engine='xlsxwriter')
# Write DataFrame to Excel
df.to_excel(writer, index=False)
# Get the xlsxwriter workbook and worksheet objects
workbook = writer.book
worksheet = writer.sheets['Sheet1']
# Add a hyperlink format and write clickable links in the WhatsApp Link column
hyperlink_format = workbook.add_format({'font_color': 'blue', 'underline': True})
link_col = df.columns.get_loc('WhatsApp Link')
for idx, row in df.iterrows():
    worksheet.write_url(idx + 1, link_col, row['WhatsApp Link'], hyperlink_format, string='Open WhatsApp')
# Expand columns
for col_num, value in enumerate(df.columns.values):
column_len = max(df[value].astype(str).map(len).max(), len(value)) + 2
worksheet.set_column(col_num, col_num, column_len)
# Close the Pandas Excel writer and output the Excel file
writer.close()
# Drop rows where email is empty
df = df[df['Email'] != '']
# Function to split emails if they are listed together with a comma
def split_emails(emails):
separated_emails = set()
for email in emails.split(','):
email = email.strip() # Remove any leading/trailing whitespace
separated_emails.add(email)
return separated_emails
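# Example (hypothetical addresses):
# split_emails("info@example.com, sales@example.com") -> {"info@example.com", "sales@example.com"}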
# New DataFrame to store the updated data
new_data = []
# Dictionary to keep track of business names and their counts
business_counts = {}
# Iterate through each row in the DataFrame
for index, row in df.iterrows():
# Split emails if they are listed together with a comma
emails = row['Email']
separated_emails = split_emails(emails)
# Create a new row for each separated email
for email in separated_emails:
new_row = row.copy() # Copy data from the original row
new_row['Email'] = email # Replace the 'Email' with the separated email
        # If the row had several emails, number the duplicated business names to keep them unique
if len(separated_emails) > 1:
business_name = row['name']
count = business_counts.get(business_name, 1)
new_row['name'] = f"{business_name} {count}"
business_counts[business_name] = count + 1
new_data.append(new_row)
# Create a new DataFrame with the updated data
new_df = pd.DataFrame(new_data)
# Save the new DataFrame back to the Excel file
new_df.to_excel("Emails.xlsx", index=False)