Commit 4c8ea465 authored by T. Fischer

Greatly improving email extraction script

- Supporting email addresses across multiple lines
- Cleaning/filtering both full names and email addresses
- Guessing full names from email addresses: first.last@example.com gives 'First Last'
- Adding comments
... and many other changes and fixes.
parent 73583c23
......@@ -2,43 +2,91 @@
import os
import sys
import re
import string
import recentmail
# Pattern locating email addresses together with an optional preceding full
# name and an optional pair of angle brackets, for example
#   To: Jane Doe <jane.doe@example.com>
# findall() yields tuples of (fullname, openinganglebracket, emailaddress);
# groups that did not take part in a match are empty strings.
# A full name is only recognized directly after ": " or ", " (colon or comma
# plus one whitespace character) and may be bare, single-quoted or
# double-quoted.
# NOTE(review): the character class in the look-ahead was reconstructed as
# [A-Z0-9@._%+-]; in the reviewed text it had been overwritten by an
# email-obfuscation placeholder. Confirm against the repository.
emailre = re.compile(
    r"(?:(?P<fullname>(?<=[:,]\s)(?:[^:\",;'*<@>]+[^:\",;'*<@>\s]*[^:\",;'*<@>]+\b|'[^']+'|\"[^\"]+\"))\s+)?(?P<openinganglebracket><)?(?=[A-Z0-9][A-Z0-9@._%+-]{5,253})(?P<emailaddress>[A-Z0-9._%+-]{1,64}@(?:(?=[A-Z0-9-]{1,63}\.)[A-Z0-9]+(?:-[A-Z0-9]+)*\.){1,8}[A-Z]{2,63})(?(openinganglebracket)>|)", flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
# Pattern matching any run of whitespace (spaces, tabs, line breaks); used to
# contract such runs into a single space
sequenceofspacesre = re.compile(r"\s+", flags=re.MULTILINE)
def get_account(emailaddress):
    """Return the account (the part before the '@') of an email address.

    Returns None if the address is empty or None, or if it does not consist
    of exactly two non-empty parts separated by a single '@'.
    """
    # Guard against None/empty input instead of failing on .split()
    if not emailaddress:
        return None
    components = emailaddress.split("@")
    # Exactly one '@' with something on both sides is required
    if len(components) != 2 or not all(components):
        return None
    return components[0]
def get_email_addresses(text):
    """Generate (fullname, emailaddress) tuples for addresses found in text.

    Uses the module-level patterns 'emailre' and 'sequenceofspacesre' and
    the helper 'get_account'. Email addresses are lower-cased. Addresses
    that look like automated systems or mailing list plumbing are skipped.
    The full name is whitespace-normalized, dropped if it merely repeats
    the email address, guessed from accounts shaped like 'first.last' if
    missing, and freed from surrounding quotation marks. If no usable full
    name is available, None is yielded in its place.
    """
    # Substrings anywhere in the address pointing to automated systems.
    # "-daemon" is carried over from the superseded second filter list so
    # that no filter present before gets lost.
    automatedneedles = ("bugzilla", "noreply", "no-reply", "-daemon", "jabber")
    # Suffixes typical for mailing list or auto-reply accounts
    listsuffixes = ("-owner", "-reply", "-bounces",
                    "-request", "-unregister", ".svar")
    # The module-level pattern is used on purpose: it also matches
    # names and addresses that are spread across multiple lines
    for (fullname, _, emailaddress) in emailre.findall(text):
        emailaddress = emailaddress.lower().strip()
        # If email address is empty, skip/continue
        if not emailaddress:
            continue
        # Skip if email address contains patterns pointing to automated systems
        if any(needle in emailaddress for needle in automatedneedles):
            continue
        account = get_account(emailaddress) or ""
        # A matched address always ends in its top-level domain, so suffixes
        # like "-owner" can only ever apply to the account part; the check on
        # the complete address is kept for backward compatibility
        if account.endswith(listsuffixes) or emailaddress.endswith(listsuffixes):
            continue
        if emailaddress.startswith("mailman."):
            continue
        # Clean full name by contracting or removing sequences of whitespace
        fullname = sequenceofspacesre.sub(" ", fullname.strip())
        # If the full name contains the email address, such a full name
        # should not be used any further
        if emailaddress in fullname.lower():
            fullname = None
        # Generic pattern: if mail address looks like first.last@example.com
        # and no full name is given, guess that full name is "First Last"
        if account and not fullname:
            accountcomponents = account.split(".")
            if (len(accountcomponents) >= 2
                    and all(len(x) > 2 for x in accountcomponents)
                    and all(x[0] in string.ascii_lowercase for x in accountcomponents)):
                fullname = " ".join(x[0].upper() + x[1:]
                                    for x in accountcomponents)
        # Process the full name if there is any set
        if fullname:
            # Skip/continue if full name contains certain substrings
            if any(needle in fullname for needle in ["Jabber"]):
                continue
            # Clean full name from certain substrings
            for needle in ["US Nilex "]:
                fullname = fullname.replace(needle, "")
            # Remove surrounding quotation marks
            if len(fullname) > 3 and ((fullname[0] == '"' and fullname[-1] == '"') or (fullname[0] == "'" and fullname[-1] == "'")):
                fullname = fullname[1:-1]
        # Being a generator, return next fullname-email tuple
        yield (fullname if fullname else None, emailaddress)
def process_mailtext():
    """Echo standard input to standard output and record its email addresses.

    The complete input is read in one go (not line by line) so that names
    and addresses spanning multiple lines can be found. Every extracted
    email address gets an entry in 'recentmail.uniqueemailaddresses' whose
    value is the set of full names seen for that address.
    """
    # Standard input can only be consumed once, so read all of it here
    text = sys.stdin.read()
    # This script being used as 'display filter' for mutt,
    # print text as it is read
    print(text, end='')
    # Iterate over all fullname-email tuples extracted from text
    for (fullname, emailaddress) in get_email_addresses(text):
        # Retrieve already known full names for current email address
        # (or new, empty set if encountering email address for first time)
        nameset = recentmail.uniqueemailaddresses.setdefault(
            emailaddress, set())
        if fullname:
            # Add full name to email address's set
            nameset.add(fullname)
# Determine the login name once; it is part of the per-user file name
# NOTE(review): os.getlogin() needs a controlling terminal -- confirm this
# holds for every way the script gets invoked
username = os.getlogin()
# Per-user file holding the email addresses and full names collected so far
recentemailaddressesfile = "/tmp/.{username}.recentemailaddresses".format(
    username=username)
# Load known addresses, add those found on standard input, save the result
recentmail.load(recentemailaddressesfile)
process_mailtext()
recentmail.save(recentemailaddressesfile)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment