Skip to content

Commit

Permalink
emails
Browse files Browse the repository at this point in the history
  • Loading branch information
philipperemy committed Oct 1, 2024
1 parent abf83fc commit c0424d2
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 12 deletions.
32 changes: 32 additions & 0 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from waitress import serve

from names_dataset import NameDataset, NameWrapper
from names_dataset.emails import extract_names_from_email

logger = logging.getLogger(__name__)
logging.basicConfig(
Expand Down Expand Up @@ -46,6 +47,37 @@ def str2bool(s: Union[bool, str]) -> bool:
return False


def _describe_name(name, kind):
    """Return the dataset entry for *name* under *kind*, tagged with the name.

    *kind* is the key read from the ``nd.search`` result (``'first_name'`` or
    ``'last_name'``). Returns ``None`` when *name* is ``None``.
    """
    if name is None:
        return None
    entry = nd.search(name)[kind]
    # NOTE(review): assumes the entry is a mapping or None when the name is
    # unknown to the dataset - confirm against NameDataset.search.
    # Copy so the object returned by the dataset is not mutated.
    details = dict(entry) if entry is not None else {}
    details['name'] = name
    return details


@app.route('/split', methods=['GET'])
def split():
    """Split the ``q`` query parameter into a first and a last name.

    Responds with ``{'result': {'first_name': ..., 'last_name': ...}}`` where
    each value is the dataset entry plus a ``'name'`` key, or ``None`` when
    that part could not be extracted. A missing ``q`` or any failure is
    reported with ``status=False``.
    """
    try:
        q = request.args.get('q')
        if q is None:
            return generate_output(
                'provide a parameter q, for example '
                '[email protected] or philipperemy', status=False
            )
        first_name, last_name = extract_names_from_email(nd, q)
        # Either part may be None (only one name found); the original code
        # subscripted None in that case and always ended in the except branch.
        result = {
            'first_name': _describe_name(first_name, 'first_name'),
            'last_name': _describe_name(last_name, 'last_name')
        }
        return generate_output({'result': result}, status=True)
    except Exception as e:
        logger.exception('/split request failed')
        # Failures must not be reported as successes.
        return generate_output({'error': str(e)}, status=False)


@app.route('/country_codes', methods=['GET'])
def country_codes():
try:
Expand Down
21 changes: 11 additions & 10 deletions names_dataset/emails.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@

from names_dataset import NameDataset

nd = NameDataset()


def _compute_score(ranks: Dict):
values = {a: b for a, b in ranks['rank'].items() if b is not None}.values()
Expand All @@ -15,7 +13,7 @@ def _compute_score(ranks: Dict):
return -min(values)


def _score(candidate: str):
def _score(nd: NameDataset, candidate: str):
if len(candidate) == 0:
return float('-inf')
first_name = nd.search(candidate)['first_name']
Expand All @@ -33,8 +31,8 @@ def _score(candidate: str):


# Function to infer the best split between first and last name
def _infer_best_split(full_name: str):
max_score = _score(full_name)
def _infer_best_split(nd: NameDataset, full_name: str):
max_score = _score(nd, full_name)
best_split = (full_name, None)

# Try all possible ways to split the full_name
Expand All @@ -43,7 +41,7 @@ def _infer_best_split(full_name: str):
last = full_name[i:]

# Calculate total score for the split
total_score = _score(first) + _score(last)
total_score = _score(nd, first) + _score(nd, last)

# If this split has a higher score, update the best split
if total_score > max_score:
Expand All @@ -53,14 +51,17 @@ def _infer_best_split(full_name: str):
return best_split, max_score


def _general_score(candidate: str):
def _general_score(nd: NameDataset, candidate: str):
    """Return the higher of the candidate's first-name and last-name scores.

    The candidate is looked up once with ``nd.search``; its ``'first_name'``
    and ``'last_name'`` entries are each scored with ``_compute_score``.
    """
    lookup = nd.search(candidate)
    per_kind = [
        _compute_score(lookup[kind])
        for kind in ('first_name', 'last_name')
    ]
    return max(per_kind)


def extract_names_from_email(email: str):
def extract_names_from_email(nd: NameDataset, email: str):
if '@' not in email:
email += '@gmail.com'

email = ''.join([e for e in list(email) if not e.isnumeric()])

prefix, suffix = email.split('@')
Expand All @@ -76,7 +77,7 @@ def extract_names_from_email(email: str):
for e in ['.', '_', '-']:
if prefix.count(e) >= 2:
c_list = prefix.split(e)
scores = [_general_score(c) for c in c_list]
scores = [_general_score(nd, c) for c in c_list]
a, b = np.array(c_list)[np.argsort(scores)][-2:]
email = f'{a}.{b}@{suffix}'

Expand All @@ -102,7 +103,7 @@ def extract_names_from_email(email: str):

if not had_matched:
prefix = email.split('@')[0]
(first_name, last_name), max_score = _infer_best_split(prefix)
(first_name, last_name), max_score = _infer_best_split(nd, prefix)

if first_name is not None and len(first_name) == 1:
first_name = None
Expand Down
9 changes: 7 additions & 2 deletions tests/test_from_emails.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest

from names_dataset import NameDataset
from names_dataset.emails import extract_names_from_email


Expand Down Expand Up @@ -28,6 +29,9 @@ def test_1(self):
'[email protected]',
'[email protected]',
]
inputs2 = []
for i in inputs:
inputs2.append(i.split('@')[0])

outputs = [
[None, None],
Expand All @@ -52,8 +56,9 @@ def test_1(self):
['philippe', 'remy'],
]

for input_, output_ in zip(inputs, outputs):
first_name, last_name = extract_names_from_email(input_)
nd = NameDataset()
for input_, output_ in zip(inputs2, outputs):
first_name, last_name = extract_names_from_email(nd, input_)
print(input_)
self.assertEqual(output_[0], first_name)
self.assertEqual(output_[1], last_name)
Expand Down

0 comments on commit c0424d2

Please sign in to comment.