import re
import pandas as pd
import urllib.parse
import xlsxwriter
import urllib.request
import time
import dns.resolver
import socket
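# This script reads a list of business websites from test.xlsx, scrapes each
# homepage (and its /contact page) for email addresses, validates them with a
# series of syntax and DNS checks, builds personalised WhatsApp links from the
# message.txt template, and writes the results to WhatsApp.xlsx and Emails.xlsx.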
# Regex for extracting candidate email addresses from page text,
# e.g. something-x.01+tag@some-domain.com
emailRegex = re.compile(r'''
    [a-zA-Z0-9._%+-]+      # local part
    @
    [a-zA-Z0-9.-]+         # domain
    \.[a-zA-Z]{2,}         # top-level domain
''', re.VERBOSE)
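# Illustrative example (hypothetical address):
# emailRegex.findall("Reach us at info@example.com") should yield ['info@example.com']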
# Email validation function
def is_valid_email(email):
# Syntax validation
if not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
return False
# Check if the email contains more than 8 digits
if len(re.findall(r'\d', email)) > 8:
return False
# Extract domain from email
domain = email.split('@')[1]
# Check against popular email providers
popular_providers = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'aol.com']
if domain.lower() in popular_providers:
return True
image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp']
for ext in image_extensions:
if email.lower().endswith(ext):
return False
try:
# DNS MX record lookup
dns.resolver.resolve(domain, 'MX')
return True
except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
pass
except Exception as e:
print(f"DNS resolver error: {e}")
pass
    try:
        # SPF record check: look for a TXT record containing "v=spf1"
        txt_records = dns.resolver.resolve(domain, 'TXT')
        for record in txt_records:
            if 'v=spf1' in record.to_text().lower():
                return True
        print("SPF record not found")
    except Exception as e:
        print(f"SPF record error: {e}")
    try:
        # DKIM record check (best effort: DKIM records normally live at
        # <selector>._domainkey.<domain>, so this bare lookup may fail for valid domains)
        dns.resolver.resolve(f'_domainkey.{domain}', 'TXT')
        return True
    except Exception as e:
        print(f"DKIM record error: {e}")
    try:
        # Last resort: check that the domain at least resolves for SMTP (port 25)
        socket.getaddrinfo(domain, 25, socket.AF_INET, socket.SOCK_STREAM)
        return True
    except Exception as e:
        print(f"SMTP reachability error: {e}")
    print("No hope for this email, it can't be checked")
return False
# Dictionary to store emails by website
emails_by_website = {}
# Extracting emails
def extractEmailsFromUrlText(urlText, website):
    extractedEmails = emailRegex.findall(urlText)
    for email in extractedEmails:
        email = email.lower()
        print("--------------------------------")
        print(email)
        if is_valid_email(email):  # Validate the email address
            if website in emails_by_website:
                emails_by_website[website].add(email)  # Add the email to the set
            else:
                emails_by_website[website] = {email}  # Create a new set for the website
# HtmlPage Read Func
def htmlPageRead(url, i):
print(url)
# Ensure URL has a scheme (http or https)
if not url.startswith(('http://', 'https://')):
url = 'http://' + url # Assuming http if no scheme is provided
try:
start = time.time()
headers = {'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(request)
urlHtmlPageRead = response.read()
        urlText = urlHtmlPageRead.decode('utf-8', errors='replace')  # Decode as UTF-8, replacing undecodable bytes
print("%s.%s\tFetched in : %s" % (i, url, (time.time() - start)))
return urlText
except Exception as e: # Catch the specific exception and print it
print(f"Error fetching {url}: {e}")
return None
# EmailsLeechFunction
def emailsLeechFunc(url, i):
mainUrlText = htmlPageRead(url, i)
print(mainUrlText)
if mainUrlText:
extractEmailsFromUrlText(mainUrlText, url) # Extract emails from main URL
    # Also check the /contact page for additional addresses
    contactPageUrl = url.rstrip('/') + '/contact'
    contactPageText = htmlPageRead(contactPageUrl, i)
    if contactPageText:
        extractEmailsFromUrlText(contactPageText, url)  # Store /contact emails under the main website URL
# Read URLs from Excel file
start = time.time()
df = pd.read_excel("test.xlsx")
i = 0
# Function to read message from file
def read_message_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
message = file.read().strip()
return message
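# message.txt is expected to contain the outreach text with a {{name}}
# placeholder, which is replaced with each business name below.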
# Scrape every website listed in the spreadsheet
for index, row in df.iterrows():
i += 1
url = row['website'] # Assuming 'website' column contains the URLs
if pd.notnull(url):
emailsLeechFunc(url, i)
# Keep only the name, address, website, and phone_number columns from the spreadsheet
df = df[['name', 'address', 'website', 'phone_number']]
# Function to create a WhatsApp link with a pre-filled, URL-encoded message
def generate_whatsapp_link(phone_number, message):
    return 'https://web.whatsapp.com/send?phone={}&text={}'.format(phone_number, urllib.parse.quote(message))
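# Example (hypothetical number):
# generate_whatsapp_link('15551234567', 'Hello Acme 1')
# -> 'https://web.whatsapp.com/send?phone=15551234567&text=Hello%20Acme%201'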
# Read message from file
message = read_message_from_file('message.txt')
# Iterate over rows and replace '{{name}}' with the corresponding company name
# (the full message is URL-encoded later, when the WhatsApp link is built)
df['Message'] = ''
for index, row in df.iterrows():
    company_name = str(row['name'])  # Convert to string
    if not company_name or company_name == 'nan':  # Guard against missing names
        company_name = ''
    df.at[index, 'Message'] = message.replace('{{name}}', company_name)
# Add WhatsApp Link column
df['WhatsApp Link'] = df.apply(lambda row: generate_whatsapp_link(row['phone_number'], row['Message']), axis=1)
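# At this point df has: name, address, website, phone_number, Message, WhatsApp Link
# (an Email column is added below before the sheet is written)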
# Add an Email column with the scraped addresses for each website
for index, row in df.iterrows():
url = row['website']
if url in emails_by_website:
df.at[index, 'Email'] = ', '.join(emails_by_website[url])
else:
df.at[index, 'Email'] = ''
# Save DataFrame to Excel with clickable links and expanded columns
excel_file = 'WhatsApp.xlsx'
writer = pd.ExcelWriter(excel_file, engine='xlsxwriter')
# Write DataFrame to Excel
df.to_excel(writer, index=False)
# Get the xlsxwriter workbook and worksheet objects
workbook = writer.book
worksheet = writer.sheets['Sheet1']
# Add a hyperlink format and write clickable links in the WhatsApp Link column
hyperlink_format = workbook.add_format({'font_color': 'blue', 'underline': True})
link_col = df.columns.get_loc('WhatsApp Link')
for idx, row in df.iterrows():
    worksheet.write_url(idx + 1, link_col, row['WhatsApp Link'], hyperlink_format, string='Open WhatsApp')
# Expand columns
for col_num, value in enumerate(df.columns.values):
column_len = max(df[value].astype(str).map(len).max(), len(value)) + 2
worksheet.set_column(col_num, col_num, column_len)
# Close the Pandas Excel writer and output the Excel file
writer.close()
# Drop rows where email is empty
df = df[df['Email'] != '']
# Function to split emails if they are listed together with a comma
def split_emails(emails):
separated_emails = set()
for email in emails.split(','):
email = email.strip() # Remove any leading/trailing whitespace
separated_emails.add(email)
return separated_emails
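# Example (hypothetical addresses):
# split_emails("info@example.com, sales@example.com") -> {"info@example.com", "sales@example.com"}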
# New DataFrame to store the updated data
new_data = []
# Dictionary to keep track of business names and their counts
business_counts = {}
# Iterate through each row in the DataFrame
for index, row in df.iterrows():
# Split emails if they are listed together with a comma
emails = row['Email']
separated_emails = split_emails(emails)
# Create a new row for each separated email
for email in separated_emails:
new_row = row.copy() # Copy data from the original row
new_row['Email'] = email # Replace the 'Email' with the separated email
        # If the row had several emails, number the duplicated business names to keep them unique
if len(separated_emails) > 1:
business_name = row['name']
count = business_counts.get(business_name, 1)
new_row['name'] = f"{business_name} {count}"
business_counts[business_name] = count + 1
new_data.append(new_row)
# Create a new DataFrame with the updated data
new_df = pd.DataFrame(new_data)
# Save the new DataFrame back to the Excel file
new_df.to_excel("Emails.xlsx", index=False)