Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cleanco/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


def typesources():
"business types / abbreviations sorted by length of business type"
"""business types / abbreviations sorted by length of business type"""
types = []
for business_type in terms_by_type:
for item in terms_by_type[business_type]:
Expand All @@ -32,7 +32,7 @@ def typesources():


def countrysources():
"business countries / type abbreviations sorted by length of type abbreviations"
"""business countries / type abbreviations sorted by length of type abbreviations"""
countries = []
for country in terms_by_country:
for item in terms_by_country[country]:
Expand All @@ -42,20 +42,20 @@ def countrysources():


def matches(name, sources):
"get types or countries matching with the legal terms in name"
"""get types or countries matching with the legal terms in name"""

name = strip_tail(name)
parts = name.split()
nparts = [normalized(p) for p in parts]
matches = []
matched = []
for classifier, term in sources:
nterm = normalized(term)
try:
idx = nparts.index(nterm)
except ValueError:
pass
else:
matches.append(classifier)
matched.append(classifier)

return matches
return matched

17 changes: 8 additions & 9 deletions cleanco/clean.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Functions to help clean & normalize business names.

See http://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details
See https://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details
on Unicode normalization and the NFKD normalization used here.

Basic usage:
Expand All @@ -13,17 +13,16 @@

import functools
import operator
from collections import OrderedDict
import re
import unicodedata
from .termdata import terms_by_type, terms_by_country
from .non_nfkd_map import NON_NFKD_MAP

tail_removal_rexp = re.compile(r"[^\.\w]+$", flags=re.UNICODE)
tail_removal_rexp = re.compile(r"[^.\w]+$", flags=re.UNICODE)


def get_unique_terms():
"retrieve all unique terms from termdata definitions"
"""retrieve all unique terms from termdata definitions"""
ts = functools.reduce(operator.iconcat, terms_by_type.values(), [])
cs = functools.reduce(operator.iconcat, terms_by_country.values(), [])
return set(ts + cs)
Expand All @@ -46,25 +45,25 @@ def strip_punct(t):


def normalize_terms(terms):
"normalize terms"
"""normalize terms"""
return (strip_punct(remove_accents(t)) for t in terms)


def strip_tail(name):
"get rid of all trailing non-letter symbols except the dot"
"""get rid of all trailing non-letter symbols except the dot"""
match = re.search(tail_removal_rexp, name)
if match is not None:
name = name[: match.span()[0]]
return name


def normalized(text):
"caseless Unicode normalization"
"""caseless Unicode normalization"""
return remove_accents(text)


def prepare_default_terms():
"construct an optimized term structure for basename extraction"
"""construct an optimized term structure for basename extraction"""
terms = get_unique_terms()
nterms = normalize_terms(terms)
ntermparts = (t.split() for t in nterms)
Expand All @@ -74,7 +73,7 @@ def prepare_default_terms():


def custom_basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs):
"return cleaned base version of the business name"
"""return cleaned base version of the business name"""

name = strip_tail(name)
nparts = name.split()
Expand Down
Loading
Loading