diff --git a/cleanco/classify.py b/cleanco/classify.py index baf5480..8a93bb1 100644 --- a/cleanco/classify.py +++ b/cleanco/classify.py @@ -22,7 +22,7 @@ def typesources(): - "business types / abbreviations sorted by length of business type" + """business types / abbreviations sorted by length of business type""" types = [] for business_type in terms_by_type: for item in terms_by_type[business_type]: @@ -32,7 +32,7 @@ def typesources(): def countrysources(): - "business countries / type abbreviations sorted by length of type abbreviations" + """business countries / type abbreviations sorted by length of type abbreviations""" countries = [] for country in terms_by_country: for item in terms_by_country[country]: @@ -42,12 +42,12 @@ def countrysources(): def matches(name, sources): - "get types or countries matching with the legal terms in name" + """get types or countries matching with the legal terms in name""" name = strip_tail(name) parts = name.split() nparts = [normalized(p) for p in parts] - matches = [] + matched = [] for classifier, term in sources: nterm = normalized(term) try: @@ -55,7 +55,7 @@ def matches(name, sources): except ValueError: pass else: - matches.append(classifier) + matched.append(classifier) - return matches + return matched diff --git a/cleanco/clean.py b/cleanco/clean.py index 0454c1c..ba4f6f7 100644 --- a/cleanco/clean.py +++ b/cleanco/clean.py @@ -1,6 +1,6 @@ """Functions to help clean & normalize business names. -See http://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details +See https://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details on Unicode normalization and the NFKD normalization used here. Basic usage: @@ -13,17 +13,16 @@ import functools import operator -from collections import OrderedDict import re import unicodedata from .termdata import terms_by_type, terms_by_country from .non_nfkd_map import NON_NFKD_MAP -tail_removal_rexp = re.compile(r"[^\.\w]+$", flags=re.UNICODE) +tail_removal_rexp = re.compile(r"[^.\w]+$", flags=re.UNICODE) def get_unique_terms(): - "retrieve all unique terms from termdata definitions" + """retrieve all unique terms from termdata definitions""" ts = functools.reduce(operator.iconcat, terms_by_type.values(), []) cs = functools.reduce(operator.iconcat, terms_by_country.values(), []) return set(ts + cs) @@ -46,12 +45,12 @@ def strip_punct(t): def normalize_terms(terms): - "normalize terms" + """normalize terms""" return (strip_punct(remove_accents(t)) for t in terms) def strip_tail(name): - "get rid of all trailing non-letter symbols except the dot" + """get rid of all trailing non-letter symbols except the dot""" match = re.search(tail_removal_rexp, name) if match is not None: name = name[: match.span()[0]] @@ -59,12 +58,12 @@ def strip_tail(name): def normalized(text): - "caseless Unicode normalization" + """caseless Unicode normalization""" return remove_accents(text) def prepare_default_terms(): - "construct an optimized term structure for basename extraction" + """construct an optimized term structure for basename extraction""" terms = get_unique_terms() nterms = normalize_terms(terms) ntermparts = (t.split() for t in nterms) @@ -74,7 +73,7 @@ def prepare_default_terms(): def custom_basename(name, terms, suffix=True, prefix=False, middle=False, **kwargs): - "return cleaned base version of the business name" + """return cleaned base version of the business name""" name = strip_tail(name) nparts = name.split() diff --git a/cleanco/termdata.py b/cleanco/termdata.py index 0af705c..0175bae 100644 --- a/cleanco/termdata.py +++ b/cleanco/termdata.py @@ -1,145 +1,1229 @@ terms_by_type = { - 'Corporation': ['company', 'incorporated', 'corporation', 'corp.', 'corp', 'inc', - '& co.', '& co', 'inc.', 's.p.a.', 'n.v.', 'a.g.', 'ag', 'nuf', 's.a.', 's.f.', - 'oao', 'co.', 'co' - ], - 'General Partnership': ['soc.col.', 'stg', 'd.n.o.', 'ltda.', 'v.o.s.', 'a spol.', - u've\xc5\x99. obch. spol.', 'kgaa', 'o.e.', 's.f.', 's.n.c.', 's.a.p.a.', 'j.t.d.', - 'v.o.f.', 'sp.j.', 'og', 'sd', ' i/s', 'ay', 'snc', 'oe', 'bt.', 's.s.', 'mb', - 'ans', 'da', 'o.d.', 'hb', 'pt' - ], - 'Joint Stock / Unlimited': ['unltd', 'ultd', 'sal', 'unlimited', 'saog', 'saoc', 'aj', - 'yoaj', 'oaj', 'akc. spol.', 'a.s.' - ], - 'Joint Venture': ['esv', 'gie', 'kv.', 'qk'], - 'Limited': ['pty. ltd.', 'pty ltd', 'ltd', 'l.t.d.', 'bvba', 'd.o.o.', 'ltda', 'gmbh', - 'g.m.b.h', 'kft.', 'kht.', 'zrt.', 'ehf.', 's.a.r.l.', 'd.o.o.e.l.', 's. de r.l.', - 'b.v.', 'tapui', - 'sp. z.o.o.', 'sp. z o.o.', 'spółka z o.o.', - 's.r.l.', 's.l.', 's.l.n.e.', 'ood', 'oy', 'rt.', - 'teo', 'uab', 'scs', 'sprl', 'limited', 'bhd.', 'sdn. bhd.', 'sdn bhd', 'as', - 'lda.', 'tov', 'pp' - ], - 'Limited Liability Company': ['pllc', 'llc', 'l.l.c.', 'plc.', 'plc', 'hf.', 'oyj', - 'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'srl', 'aat', '3at', 'd.d.', - 's.r.o.', 'spol. s r.o.', 's.m.b.a.', 'smba', 'sarl', 'nv', 'sa', 'aps', - 'a/s', 'p/s', 'sae', 'sasu', 'eurl', 'ae', 'cpt', 'as', 'ab', 'asa', 'ooo', 'dat', - 'vat', 'zat', 'mchj', 'a.d.' - ], - 'Limited Liability Limited Partnership': ['lllp', 'l.l.l.p.'], - 'Limited Liability Partnership': ['llp', 'l.l.p.', 'sp.p.', 's.c.a.', 's.c.s.'], - 'Limited Partnership': ['gmbh & co. kg', 'lp', 'l.p.', 's.c.s.', - 's.c.p.a', 'comm.v', 'k.d.', 'k.d.a.', 's. en c.', 'e.e.', 's.a.s.', 's. en c.', - 'c.v.', 's.k.a.', 'sp.k.', 's.cra.', 'ky', 'scs', 'kg', 'kd', 'k/s', 'ee', 'secs', - 'kda', 'ks', 'kb','kt' - ], - 'Mutual Fund': ['sicav'], - 'No Liability': ['nl'], - 'Non-Profit': ['vzw', 'ses.', 'gte.'], - 'Private Company': ['private', 'pte', 'xk'], - 'Professional Corporation': ['p.c.', 'vof', 'snc'], - 'Professional Limited Liability Company': ['pllc', 'p.l.l.c.'], - 'Sole Proprietorship': ['e.u.', 's.p.', 't:mi', 'tmi', 'e.v.', 'e.c.', 'et', 'obrt', - 'fie', 'ij', 'fop', 'xt' - ] + "Additional Liability Company": [ + "tovarystvo z dodatkvoiu vidpovidalnistiu", + "товариство з додатковою відповідальністю", + ], + "Association of Co-Owners": [ + "association des coproprietaires de", + "associação comunitária dos moradores", + "společenství vlastníků jednotek", + "syndicat de la copropriété du", + "vereniging van mede eigenaars van", + ], + "Cooperative Company with Limited Liability": [ + "eingetragene genossenschaft mit beschrankter haftpflicht", + ], + "Corporation": [ + "& co", + "& co.", + "a.g.", + "ag", + "co", + "co.", + "company", + "corp", + "corp.", + "corporation", + "inc", + "inc.", + "incorporated", + "n.v.", + "nuf", + "oao", + "s.a.", + "s.f.", + "s.p.a.", + ], + "General Partnership": [ + " i/s", + "a spol.", + "ans", + "ay", + "bt.", + "d.n.o.", + "da", + "hb", + "j.t.d.", + "kgaa", + "ltda.", + "mb", + "o.d.", + "o.e.", + "oe", + "og", + "pt", + "s.a.p.a.", + "s.f.", + "s.n.c.", + "s.s.", + "sd", + "snc", + "soc.col.", + "sp.j.", + "stg", + "v.o.f.", + "v.o.s.", + "veř. obch. spol.", + ], + "Joint Stock / Unlimited": [ + "a.s.", + "aj", + "akc. spol.", + "akciová společnost", + "aktiengesellschaft", + "oaj", + "sal", + "saoc", + "saog", + "ultd", + "unlimited", + "unltd", + "yoaj", + ], + "Joint Venture": [ + "esv", + "gie", + "kv.", + "qk", + ], + "Limited": [ + "as", + "b.v.", + "bhd.", + "bvba", + "d.o.o.", + "d.o.o.e.l.", + "ehf.", + "g.m.b.h", + "gesellschaft mit beschränkter haftung", + "gmbh", + "kft.", + "kht.", + "l.t.d.", + "lda.", + "limited", + "ltd", + "ltda", + "ood", + "oy", + "pp", + "pty ltd", + "pty. ltd.", + "rt.", + "s. de r.l.", + "s. de r.l. de c.v.", + "s.a.r.l.", + "s.l.", + "s.l.n.e.", + "s.r.l.", + "sdn bhd", + "sdn. bhd.", + "sp. z o.o.", + "sp. z.o.o.", + "sprl", + "spółka z o.o.", + "spółka z ograniczoną odpowiedzialnością", + "tapui", + "teo", + "tov", + "tovarystvo z obmezhenoiu vidpovidalnistiu", + "uab", + "zrt.", + "товариство з обмеженою відповідальністю", + ], + "Limited Liability Company": [ + "3at", + "a.d.", + "a.e.", + "a/s", + "aat", + "ab", + "ae", + "aps", + "as", + "asa", + "cpt", + "d.d.", + "dat", + "eurl", + "exploitation agricole à responsabilité limitée", + "hf.", + "l.l.c.", + "llc", + "mchj", + "nv", + "nyrt.", + "ooo", + "oyj", + "p.l.c.", + "p/s", + "plc", + "plc.", + "pryvatne pidpryyemstvo", + "s. r. o.", + "s.a.", + "s.m.b.a.", + "s.r.l.", + "s.r.o.", + "sa", + "sabiedrība ar ierobežotu atbildību", + "sae", + "sarl", + "sasu", + "sh.a.", + "smba", + "spol. s r. o.", + "spol. s r.o.", + "spol. s.r.o.", + "společnost s ručením omezeným", + "spoločnosť s ručením obmedzeným", + "srl", + "srl.", + "vat", + "zat", + "приватне підприємство", + ], + "Limited Liability Company in Liquidation": [ + "s.r.o. v likvidácii", + "s.r.o. v likvidácii", + "spol. s r.o. v likvidaci", + "spol. s r.o. v likvidácii", + "spol. s.r.o. v likvidaci", + "spol. s.r.o. v likvidácii", + "spółka z ograniczoną odpowiedzialnością likwidacji", + ], + "Limited Liability Limited Partnership": [ + "l.l.l.p.", + "lllp", + ], + "Limited Liability Partnership": [ + "l.l.p.", + "llp", + "s.c.a.", + "s.c.s.", + "sp.p.", + ], + "Limited Partnership": [ + "c.v.", + "comm.v", + "e.e.", + "ee", + "gmbh & co. kg", + "k.d.", + "k.d.a.", + "k.s.", + "k/s", + "kb", + "kd", + "kda", + "kg", + "komanditná spoločnosť", + "komanditní společnost", + "kommanditgesellschaft", + "ks", + "kt", + "ky", + "l.p.", + "lp", + "s. en c.", + "s.a.s.", + "s.c.p.a", + "s.c.s.", + "s.cra.", + "s.k.a.", + "scs", + "secs", + "sp.k.", + "spółka komandytowa", + ], + "Mutual Fund": [ + "sicav", + ], + "No Liability": [ + "nl", + ], + "Non-Profit": [ + "gte.", + "ses.", + "vzw", + ], + "Private Company": [ + "private", + "pte", + "xk", + ], + "Professional Corporation": [ + "p.c.", + "vof", + ], + "Professional Limited Liability Company": [ + "p.l.l.c.", + "pllc", + ], + "Sole Proprietorship": [ + "e.c.", + "e.u.", + "e.v.", + "et", + "fie", + "fop", + "ij", + "obrt", + "s.p.", + "t:mi", + "tmi", + "xt", + ], } terms_by_country = { - 'Albania': ['sh.a.', 'sh.p.k.'], - 'Argentina': ['s.a.', 's.r.l.', 's.c.p.a', 'scpa', 's.c.e i.', 's.e.', 's.g.r', - 'soc.col.' - ], - 'Australia': ['nl', 'pty. ltd.', 'pty ltd'], - 'Austria': ['e.u.', 'stg', 'gesbr', 'a.g.', 'ag', 'og', 'kg'], - 'Belarus': ['aat', '3at'], - 'Belgium': ['esv', 'vzw', 'vof', 'snc', 'comm.v', 'scs', 'bvba', 'sprl', 'cvba', - 'cvoa', 'sca', 'sep', 'gie' - ], - 'Bosnia / Herzegovina': ['d.d.', 'a.d.', 'd.n.o.', 'd.o.o.', 'k.v.', 's.p.'], - 'Brazil': ['ltda', 's.a.', 'pllc', 'ad', 'adsitz', 'ead', 'et', 'kd', 'kda', 'sd'], - 'Bulgaria': ['ad', 'adsitz', 'ead', 'et', 'kd', 'kda', 'sd'], - 'Cambodia': ['gp', 'sm pte ltd.', 'pte ltd.', 'plc ltd.', 'peec', 'sp'], - 'Canada': ['gp', 'lp', 'sp'], - 'Chile': ['eirl', 's.a.', 'sgr', 's.g.r.', 'ltda', 's.p.a.', 'sa', 's. en c.', - 'ltda.' - ], - 'Columbia': ['s.a.', 'e.u.', 's.a.s.', 'suc. de descendants', 'sca'], - 'Croatia': ['d.d.', 'd.o.o.', 'obrt'], - 'Czech Republic': ['a.s.', 'akc. spol.', 's.r.o.', 'spol. s r.o.', 'v.o.s.', u've\xc5\x99. obch. spol.', 'a spol.', 'k.s.', 'kom. spol.', 'kom. spol.'], - 'Denmark': ['i/s', 'a/s', 'k/s', 'p/s', 'amba', 'a.m.b.a.', 'fmba', 'f.m.b.a.', 'smba', - 's.m.b.a.', 'g/s' - ], - 'Dominican Republic': ['c. por a.', 'cxa', 's.a.', 's.a.s.', 'srl.', 'srl', 'eirl.', 'sa', - 'sas' - ], - 'Ecuador': ['s.a.', 'c.a.', 'sa', 'ep'], - 'Egypt': ['sae'], - 'Estonia': ['fie', 'oü', 'as'], - 'Finland': ['t:mi', 'tmi', 'as oy', 'as.oy', 'ay', 'ky', 'oy', 'oyj', 'ok'], - 'France': ['sicav', 'sarl', 'sogepa', 'ei', 'eurl', 'sasu', 'fcp', 'gie', 'sep', 'snc', - 'scs', 'sca', 'scop', 'sem', 'sas' - ], - 'Germany': ['gmbh & co. kg', 'e.g.', 'e.v.', 'gbr', 'ohg', 'partg', - 'kgaa', 'gmbh', 'g.m.b.h.', 'ag', 'mbh & co. kg' - ], - 'Greece': ['a.e.', 'ae', 'e.e.', 'ee', 'epe', 'e.p.e.', 'mepe', 'm.e.p.e.', 'o.e.', - 'oe', 'ovee', 'o.v.e.e.' - ], - 'Guatemala': ['s.a.', 'sa'], - 'Haiti': ['sa'], - 'Hong Kong': ['ltd', 'unltd', 'ultd', 'limited'], - 'Hungary': ['e.v.', 'e.c.', 'bt.', 'kft.', 'kht.', 'kkt.', 'k.v.', 'zrt.', 'nyrt', - 'ev', 'ec', 'rt.' - ], - 'Iceland': ['ehf.', 'hf.', 'ohf.', 's.f.', 'ses.'], - 'India': ['pvt. ltd.', 'ltd.', 'psu', 'pse'], - 'Indonesia': ['ud', 'fa', 'pt'], - 'Ireland': ['cpt', 'teo'], - 'Israel': ['b.m.', 'bm', 'ltd', 'limited'], - 'Italy': ['s.n.c.', 's.a.s.', 's.p.a.', 's.a.p.a.', 's.r.l.', 's.c.r.l.', 's.s.'], - 'Japan': ['k.k.','g.k.','gk','y.k.'], - 'Latvia': ['as', 'sia', 'ik', 'ps', 'ks'], - 'Lebanon': ['sal'], - 'Lithuania': ['uab', 'ab', 'ij', 'mb'], - 'Luxemborg': ['s.a.', 's.a.r.l.', 'secs'], - 'Macedonia': ['d.o.o.', 'd.o.o.e.l', 'k.d.a.', 'j.t.d.', 'a.d.', 'k.d.'], - 'Malaysia': ['bhd.', 'sdn. bhd.'], - 'Mexico': ['s.a.', 's. de. r.l.', 's. en c.', 's.a.b.', 's.a.p.i.', 's.a. de c.v.'], - 'Mongolia': ['xk', 'xxk'], - 'Netherlands': ['v.o.f.', 'c.v.', 'b.v.', 'n.v.'], - 'New Zealand': ['tapui', 'ltd', 'limited'], - 'Nigeria': ['gte.', 'plc', 'ltd.', 'ultd.'], - 'Norway': ['asa', 'as', 'ans', 'ba', 'bl', 'da', 'etat', 'fkf', 'hf', 'iks', 'kf', - 'ks', 'nuf', 'rhf', 'sf' - ], - 'Oman': ['saog', 'saoc'], - 'Pakistan': ['ltd.', 'pvt. ltd.', 'ltd', 'limited'], - 'Peru': ['sa', 's.a.', 's.a.a.'], - 'Philippines': ['coop.', 'corp.', 'corp', 'ent.', 'inc.', 'inc', 'llc', 'l.l.c.', - 'ltd.' - ], - 'Poland': ['p.p.', 's.k.a.', 'sp.j.', 'sp.k.', 'sp.p.', 'sp. z.o.o.', 's.c.', 's.a.'], - 'Portugal': ['lda.', 'crl', 's.a.', 's.f.', 'sgps'], - 'Romania': ['s.c.a.', 's.c.s.', 's.n.c.', 's.r.l.', 'o.n.g.', 's.a.'], - 'Russia': ['ooo', 'oao', 'zao', '3ao', 'пао', 'оао', 'ооо'], - 'Serbia': ['d.o.o.', 'a.d.', 'k.d.', 'o.d.'], - 'Singapore': ['bhd', 'pte ltd', 'sdn bhd', 'llp', 'l.l.p.', 'ltd.', 'pte', 'pte. ltd.'], - 'Slovenia': ['d.d.', 'd.o.o.', 'd.n.o.', 'k.d.', 's.p.'], - 'Slovakia': ['a.s.', 'akc. spol.', 's.r.o.', 'spol. s r.o.', 'k.s.', 'kom. spol.', 'v.o.s.', 'a spol.'], - 'Spain': ['s.a.', 's.a.d.', 's.l.', 's.l.l.', 's.l.n.e.', 's.c.', 's.cra', 's.coop', - 'sal', 'sccl' - ], - 'Sweden': ['ab', 'hb', 'kb'], - 'Switzerland': ['ab', 'sa', 'gmbh', 'g.m.b.h.', 'sarl', 'sagl'], - 'Turkey': ['koop.'], - 'Ukraine': ['dat', 'fop', 'kt', 'pt', 'tdv', 'tov', 'pp', 'vat', 'zat', 'at'], - 'United Kingdom': ['plc.', 'plc', 'cic', 'cio', 'l.l.p.', 'llp', 'l.p.', 'lp', 'ltd.', - 'ltd', 'limited' - ], - 'United States of America': ['llc', 'inc.', 'corporation', 'incorporated', 'company', - 'limited', 'corp.', 'inc.', 'inc', 'llp', 'l.l.p.', 'pllc', 'and company', - '& company', 'inc', 'inc.', 'corp.', 'corp', 'ltd.', 'ltd', '& co.', '& co', 'co.', - 'co', 'lp' - ], - 'Uzbekistan': ['mchj', 'qmj', 'aj', 'oaj', 'yoaj', 'xk', 'xt', 'ok', 'uk', 'qk'] + "Albania": [ + "sh.a.", + "sh.p.k.", + ], + "Argentina": [ + "s.a.", + "s.c.e i.", + "s.c.p.a", + "s.e.", + "s.g.r", + "s.r.l.", + "scpa", + "soc.col.", + ], + "Australia": [ + "nl", + "pty ltd", + "pty. ltd.", + ], + "Austria": [ + "a.g.", + "ag", + "e.u.", + "gesbr", + "kg", + "og", + "stg", + ], + "Belarus": [ + "3at", + "aat", + "oao", + "rup", + "taa", + "tavarystva z abmezhavanaj adkaznascyu", + "up", + "zakrytae akcyyanernae tavarystva", + "zao", + "zat", + "аат", + "адкрытае акцыянернае таварыства", + "ва", + "вытворчае аб'яднанне", + "дачэрняя кампанія", + "дп", + "закрытае акцыянернае таварыства", + "закрытое акционерное общество", + "зао", + "зат", + "иностранное общество с ограниченной ответственностью", + "иооо", + "канцэрн", + "оао", + "общество с дополнительной ответственностью", + "общество с ограниченной ответственностью", + "одо", + "ооо", + "открытое акционерное общество", + "республиканское унитарное предприятие", + "руп", + "совместное общество с ограниченной ответственностью", + "сооо", + "таа", + "таварыства з абмежаванай адказнасцю", + "таварыства з дадатковай адказнасцю", + "тда", + "тк", + "транснацыяльны канцэрн", + "унитарное предприятие", + "унітарнае прадпрыемства", + "уп", + "фпг", + "фінансово-промислова група", + "холдынг", + "частное предприятие", + "частное производственно торговое унитарное предприятие", + "частное транспортное унитарное предприятие", + "частное унитарное предприятие", + "частное унитарное предприятие", + "чп", + "чтуп", + "чуп", + ], + "Belgium": [ + "bvba", + "comm.v", + "cvba", + "cvoa", + "esv", + "gie", + "sca", + "scs", + "sep", + "snc", + "sprl", + "vof", + "vzw", + ], + "Bosnia / Herzegovina": [ + "a.d.", + "d.d.", + "d.n.o.", + "d.o.o.", + "k.v.", + "s.p.", + ], + "Brazil": [ + "ad", + "adsitz", + "ead", + "et", + "kd", + "kda", + "ltda", + "pllc", + "s.a.", + "sd", + ], + "Bulgaria": [ + "ad", + "adsits", + "adsitz", + "aktsionerno drujestvo", + "aktsionerno druzhestvo sus spetsialna investitsionna tsel", + "drujestvo s ogranichena otgovornost", + "ead", + "ednolichen turgovetz", + "ednolichno aktsionerno druzhestvo", + "ednolichno druzhestvo s ogranichena otgovornost", + "eood", + "et", + "et", + "kd", + "kd", + "kda", + "kda", + "komanditno druzhestvo", + "komanditno druzhestvo s aktzii", + "ood", + "sd", + "sd", + "subiratelno druzhestvo", + "ад", + "адсиц", + "акционерно дружество", + "акционерно дружество със специална инвестиционна цел", + "дружество с ограничена отговорност", + "еад", + "едноличен търговец", + "едноличен търговец", + "еднолично акционерно дружество", + "еднолично дружество с ограничена отговорност", + "еоод", + "ет", + "кд", + "кда", + "командитно дружество", + "командитно дружество с акции", + "оод", + "сд", + "събирателно дружество", + "частное торговое унитарное предприятие", + ], + "Cambodia": [ + "gp", + "peec", + "plc ltd.", + "pte ltd.", + "sm pte ltd.", + "sp", + ], + "Canada": [ + "gp", + "lp", + "sp", + ], + "Chile": [ + "eirl", + "ltda", + "ltda.", + "s. en c.", + "s.a.", + "s.g.r.", + "s.p.a.", + "sa", + "sgr", + ], + "China": [ + "gǔfèn yǒuxiàn gōngsī)", + "wúxiàn zérèn gōngsī", + "yǒuxiàn gōngsī", + "yǒuxiàn zérèn gǔfèn gōngsī", + "上市公司", + "公司集团", + "公司集團", + "担保有限公司", + "擔保有限公司", + "无限责任公司", + "有限公司", + "有限公司", + "有限合伙", + "有限合夥", + "有限責任公司", + "有限責任股份公司", + "有限责任", + "有限责任公司", + "有限责任股份公司", + "無限責任公司", + "私人公司", + "股份公司", + "股份有限公司", + "集团公司", + "集團公司", + ], + "Columbia": [ + "e.u.", + "s.a.", + "s.a.s.", + "sca", + "suc. de descendants", + ], + "Croatia": [ + "d.d.", + "d.o.o.", + "obrt", + ], + "Czech Republic": [ + "a spol.", + "a.s.", + "akc. spol.", + "k.s.", + "kom. spol.", + "s.r.o.", + "spol. s r. o.", + "spol. s r.o.", + "v.o.s.", + "veř. obch. spol.", + ], + "Denmark": [ + "a.m.b.a.", + "a/s", + "amba", + "f.m.b.a.", + "fmba", + "g/s", + "i/s", + "k/s", + "p/s", + "s.m.b.a.", + "smba", + ], + "Dominican Republic": [ + "c. por a.", + "cxa", + "eirl.", + "s.a.", + "s.a.s.", + "sa", + "sas", + "srl", + "srl.", + ], + "Ecuador": [ + "c.a.", + "ep", + "s.a.", + "sa", + ], + "Egypt": [ + "sae", + ], + "Estonia": [ + "as", + "fie", + "oü", + ], + "Finland": [ + "as oy", + "as.oy", + "ay", + "ky", + "ok", + "oy", + "oyj", + "t:mi", + "tmi", + ], + "France": [ + "ei", + "eurl", + "fcp", + "gie", + "sarl", + "sas", + "sasu", + "sca", + "sci", + "scop", + "scs", + "sem", + "sep", + "sicav", + "snc", + "societe civile immobiliere", + "sogepa", + ], + "Germany": [ + "ag", + "aktiengesellschaft", + "e.g.", + "e.v.", + "egmbh", + "eingetragene genossenschaft mit beschränkter haftpflicht", + "g.m.b.h.", + "gbr", + "gesellschaft mit beschränkter haftung", + "gmbh", + "gmbh & co. kg", + "kg", + "kgaa", + "kommanditgesellschaft", + "mbh & co. kg", + "ohg", + "partg", + ], + "Greece": [ + "Anónimi Etaireía", + "Anónimi Viomichanikí Emborikí Etaireía", + "Etaireía Periorisménis Efthínis", + "Eterórrithmi Etaireía", + "I.K.E.", + "Idiotiki Kefalaiouchiki Etaireía", + "Monoprósopi Etaireía Periorisménis Efthínis", + "Omórrithmi Etaireía", + "Omórrithmi Viomichanikí Emborikí Etaireía", + "a.e.", + "ae", + "atomikís epicheírisis", + "e.e.", + "e.p.e.", + "ee", + "epe", + "m.e.p.e.", + "mepe", + "o.e.", + "o.v.e.e.", + "oe", + "ovee", + "Ανώνυμη Βιομηχανική Εμπορική Εταιρεία", + "Ανώνυμη Εταιρεία", + "Ε.Π.Ε.", + "Εταιρεία Περιορισμένης Ευθύνης", + "Ετερόρρυθμη Εταιρία", + "Ιδιωτική Κεφαλαιουχική Εταιρεία", + "Μ.Ε.Π.Ε.", + "Μονοπρόσωπη Ε.Π.Ε.", + "Ο.Β.Ε.Ε.", + "Ο.Ε.", + "Ομόρρυθμη Βιομηχανική Εμπορική Εταιρεία", + "Ομόρρυθμη Εταιρεία", + "ατομικής επιχείρησης", + ], + "Guatemala": [ + "s.a.", + "sa", + ], + "Haiti": [ + "sa", + ], + "Hong Kong": [ + "limited", + "ltd", + "ultd", + "unltd", + ], + "Hungary": [ + "bt.", + "e.c.", + "e.v.", + "ec", + "ev", + "k.v.", + "kft.", + "kht.", + "kkt.", + "nyrt", + "rt.", + "zrt.", + ], + "Iceland": [ + "ehf.", + "hf.", + "ohf.", + "s.f.", + "ses.", + ], + "India": [ + "ltd.", + "pse", + "psu", + "pvt. ltd.", + ], + "Indonesia": [ + "fa", + "pt", + "ud", + ], + "Ireland": [ + "cpt", + "teo", + ], + "Israel": [ + "b.m.", + "bm", + "limited", + "ltd", + ], + "Italy": [ + "s.a.p.a.", + "s.a.s.", + "s.c.r.l.", + "s.n.c.", + "s.p.a.", + "s.r.l.", + "s.s.", + ], + "Japan": [ + "g.k.", + "gk", + "k.k.", + "y.k.", + ], + "Latvia": [ + "as", + "ik", + "ks", + "ps", + "sia", + ], + "Lebanon": [ + "sal", + ], + "Lithuania": [ + "ab", + "ij", + "mb", + "uab", + ], + "Luxembourg": [ + "s.a.", + "s.a.r.l.", + "secs", + ], + "Macedonia": [ + "a.d.", + "d.o.o.", + "d.o.o.e.l", + "j.t.d.", + "k.d.", + "k.d.a.", + ], + "Malaysia": [ + "bhd.", + "sdn bhd", + "sdn. bhd.", + ], + "Mexico": [ + "s. de. r.l.", + "s. en c.", + "s.a.", + "s.a. de c.v.", + "s.a.b.", + "s.a.p.i.", + ], + "Moldova": [ + "societatea cu raspundere limitata", + ], + "Mongolia": [ + "xk", + "xxk", + ], + "Montenegro": [ + "A.D.", + "Akcionarsko Društvo", + "D.O.O.", + "Društvo sa Ograničenom Odgovornošću", + "K.D.", + "Komanditno Društvo", + "O.D.", + "Ortačko Društvo", + "Preduzetnik", + ], + "Netherlands": [ + "b.v.", + "c.v.", + "n.v.", + "stichting", + "v.o.f.", + ], + "New Zealand": [ + "limited", + "ltd", + "tapui", + ], + "Nigeria": [ + "gte.", + "ltd.", + "plc", + "ultd.", + ], + "Norway": [ + "ans", + "as", + "asa", + "ba", + "bl", + "da", + "etat", + "fkf", + "hf", + "iks", + "kf", + "ks", + "nuf", + "rhf", + "sf", + ], + "Oman": [ + "saoc", + "saog", + ], + "Pakistan": [ + "limited", + "ltd", + "ltd.", + "pvt. ltd.", + ], + "Peru": [ + "s.a.", + "s.a.a.", + "sa", + ], + "Philippines": [ + "coop.", + "corp", + "corp.", + "ent.", + "inc", + "inc.", + "l.l.c.", + "llc", + "ltd.", + ], + "Poland": [ + "p.p.", + "s.a.", + "s.c.", + "s.k.a.", + "sp. z.o.o.", + "sp.j.", + "sp.k.", + "sp.p.", + "spółka z ograniczoną odpowiedzialnością", + ], + "Portugal": [ + "crl", + "lda.", + "s.a.", + "s.f.", + "sgps", + ], + "Romania": [ + "o.n.g.", + "organizație non-guvernamentală", + "persoana fizica autorizata", + "pfa", + "s.a.", + "s.c.a.", + "s.c.s.", + "s.n.c.", + "s.r.l.", + "societate cu răspundere limitată", + "societatea cu răspundere limitată cu proprietar unic", + "societatea pe acțiuni", + "societatea în comandită pe acțiuni", + "societatea în comandită simplă", + "societatea în nume colectiv", + "societăți de persoane", + "srl cu proprietar unic", + "întreprindere individuală", + "întreprinderea individuală", + ], + "Russia": [ + "3ao", + "akcionernoe obshhestvo", + "ao", + "fgup", + "fkp", + "gk", + "gruppa kompanij", + "kao", + "oao", + "ooo", + "pao", + "zakrytoe akcionernoe obshhestvo", + "zao", + "zao", + "автономная некоммерческая организация", + "акционерное общество", + "ано", + "ао", + "артель", + "благотворительное учреждение", + "благотворительный фонд", + "бф", + "ган", + "гау", + "гбу", + "гк", + "гку", + "государственная академия наук", + "государственная компания", + "государственная корпорация", + "государственное автономное учреждение", + "государственное бюджетное учреждение", + "государственное казенное учреждение", + "государственное унитарное предприятие", + "группа компаний", + "гупс", + "закрытое акционерное общество", + "зао", + "индивидуальный предприниматель", + "интегрированная структура", + "ип", + "казенное предприятие", + "колхоз", + "коммандитное товарищество", + "кооперативное хозяйство", + "коопхоз", + "кп", + "крестьянское хозяйство", + "кх", + "мау", + "мбу", + "межправительственная международная организация", + "мкп", + "мку", + "ммо", + "муниципальное автономное учреждение", + "муниципальное бюджетное учреждение", + "муниципальное казенное предприятие", + "муниципальное казенное учреждение", + "муниципальное унитарное предприятие", + "муп", + "негосударственный пенсионный фонд", + "неправительственная международная организация", + "нмо", + "нпф", + "оао", + "оао", + "общественное учреждение", + "общественный фонд", + "общество с ограниченной ответственностью", + "оинно", + "ооо", + "ооо", + "отделение иностранной некоммерческой неправительственной организации", + "открытое акционерное общество", + "оу", + "оф", + "пао", + "пао", + "пк", + "полное товарищество", + "ппк", + "производственный кооператив", + "пт", + "публично-правовая компания", + "публичное акционерное общество", + "ра", + "ран", + "религиозная организация", + "ро", + "российская академия наук", + "рыболовецкая артель", + "са", + "сельскохозяйственная артель", + "сельскохозяйственный производственный кооператив", + "спк", + "товарищество на вере", + "унитарное предприятие", + "уп", + "учреждение", + "фгау", + "фгбу", + "фгку", + "фгуп", + "федеральное государственное автономное учреждение", + "федеральное государственное бюджетное учреждение", + "федеральное государственное казенное учреждение", + "федеральное государственное унитарное предприятие", + "федеральное казенное предприятие", + "фермерское хозяйство", + "фкп", + "фонд", + "фх", + "хк", + "хо", + "хозяйственное общество", + "хозяйственное партнерство", + "хозяйственное товарищество", + "холдинговая компания", + "хп", + "хт", + "частное учреждение", + "чу", + "экологический фонд", + "эф", + ], + "Serbia": [ + "a.d.", + "d.o.o.", + "k.d.", + "o.d.", + ], + "Singapore": [ + "bhd", + "l.l.p.", + "llp", + "ltd.", + "pte", + "pte ltd", + "pte. ltd.", + "sdn bhd", + ], + "Slovakia": [ + "a spol.", + "a.s.", + "akc. spol.", + "k.s.", + "kom. spol.", + "s.r.o.", + "spol. s r. o.", + "spol. s r.o.", + "v.o.s.", + ], + "Slovenia": [ + "d.d.", + "d.n.o.", + "d.o.o.", + "k.d.", + "s.p.", + ], + "Spain": [ + "s.a.", + "s.a.d.", + "s.c.", + "s.coop", + "s.cra", + "s.l.", + "s.l.l.", + "s.l.n.e.", + "sal", + "sccl", + ], + "Sweden": [ + "ab", + "hb", + "kb", + ], + "Switzerland": [ + "ab", + "g.m.b.h.", + "gmbh", + "sa", + "sagl", + "sarl", + ], + "Turkey": [ + "koop.", + ], + "Ukraine": [ + "aktsionerne tovarystvo", + "at", + "at", + "dat", + "derzhavne aktsionerne tovarystvo", + "dochp", + "dp", + "fizychna osoba pidpryyemets", + "fop", + "ip", + "komandytne tovarystvo", + "kt", + "mpp", + "pat", + "povne tovarystvo", + "pp", + "prat", + "prytvatne aktsionerne tovarystvo", + "pryvatne pidpryyemstvo", + "pt", + "publichne aktsionerne tovarystvo", + "pzii", + "sp", + "tdv", + "tob", + "tov", + "vat", + "vidkryte aktsionerne tovarystvo", + "zakryte aktsionerne tovarystvo", + "zat", + "аб", + "акціонерне товариство", + "ао", + "ат", + "ат", + "бо", + "вiдкрите акцiонерне товариство", + "ват", + "ват", + "вк", + "вкксу", + "гк", + "го", + "громадська організація", + "гс", + "даз", + "дат", + "державне акціонерне товариство", + "до", + "дочп", + "дп", + "жбк", + "закрите акцiонерне товариство", + "зат", + "зат", + "зіі", + "кб", + "ко", + "командитне товариство", + "кп", + "кс", + "кт", + "кт", + "мпп", + "нпф", + "оас", + "одв", + "ок", + "омс", + "оо", + "оог", + "оп", + "ор", + "осбб", + "осн", + "осс", + "пt", + "пат", + "пат", + "пзіі", + "по", + "повне товариство", + "пог", + "пп", + "пп", + "прат", + "прат", + "приватне акціонерне товариство", + "приватне підприємство", + "пск", + "пт", + "публічне акціонерне товариство", + "пфо", + "піі", + "релігійна організація", + "ро", + "свк", + "ск", + "сок", + "сп", + "сп у формі тов", + "спд", + "сс", + "сст", + "ст", + "тб", + "тдв", + "тдв", + "тзов", + "тов", + "тс", + "фб", + "фг", + "фоп", + "фоп", + "фізична особа підприємець", + "хк", + "іп", + ], + "United Kingdom": [ + "cic", + "cio", + "l.l.p.", + "l.p.", + "limited", + "llp", + "lp", + "ltd", + "ltd.", + "plc", + "plc.", + ], + "United States of America": [ + "& co", + "& co.", + "& company", + "and company", + "co", + "co.", + "company", + "corp", + "corp.", + "corporation", + "inc", + "inc.", + "incorporated", + "l.l.p.", + "limited", + "llc", + "llp", + "lp", + "ltd", + "ltd.", + "pllc", + ], + "Uzbekistan": [ + "aj", + "mchj", + "oaj", + "ok", + "qk", + "qmj", + "uk", + "xk", + "xt", + "yoaj", + ], } diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..3d74b08 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,306 @@ +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. + +[[package]] +name = "cachetools" +version = "5.5.2" +description = "Extensible memoizing collections and decorators" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, + {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, +] + +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "distlib" +version = "0.4.0" +description = "Distribution utilities" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"}, + {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.3.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" +files = [ + {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, + {file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "filelock" +version = "3.16.1" +description = "A platform independent file lock." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, + {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] + +[[package]] +name = "iniconfig" +version = "2.1.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, +] + +[[package]] +name = "packaging" +version = "25.0" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, + {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, +] + +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pyproject-api" +version = "1.8.0" +description = "API to interact with the python pyproject.toml based projects" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pyproject_api-1.8.0-py3-none-any.whl", hash = "sha256:3d7d347a047afe796fd5d1885b1e391ba29be7169bd2f102fcd378f04273d228"}, + {file = "pyproject_api-1.8.0.tar.gz", hash = "sha256:77b8049f2feb5d33eefcc21b57f1e279636277a8ac8ad6b5871037b243778496"}, +] + +[package.dependencies] +packaging = ">=24.1" +tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["furo (>=2024.8.6)", "sphinx-autodoc-typehints (>=2.4.1)"] +testing = ["covdefaults (>=2.3)", "pytest (>=8.3.3)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "setuptools (>=75.1)"] + +[[package]] +name = "pytest" +version = "7.4.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "tomli" +version = "2.3.0" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" +files = [ + {file = "tomli-2.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:88bd15eb972f3664f5ed4b57c1634a97153b4bac4479dcb6a495f41921eb7f45"}, + {file = "tomli-2.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:883b1c0d6398a6a9d29b508c331fa56adbcdff647f6ace4dfca0f50e90dfd0ba"}, + {file = "tomli-2.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1381caf13ab9f300e30dd8feadb3de072aeb86f1d34a8569453ff32a7dea4bf"}, + {file = "tomli-2.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0e285d2649b78c0d9027570d4da3425bdb49830a6156121360b3f8511ea3441"}, + {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0a154a9ae14bfcf5d8917a59b51ffd5a3ac1fd149b71b47a3a104ca4edcfa845"}, + {file = "tomli-2.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:74bf8464ff93e413514fefd2be591c3b0b23231a77f901db1eb30d6f712fc42c"}, + {file = "tomli-2.3.0-cp311-cp311-win32.whl", hash = "sha256:00b5f5d95bbfc7d12f91ad8c593a1659b6387b43f054104cda404be6bda62456"}, + {file = "tomli-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:4dc4ce8483a5d429ab602f111a93a6ab1ed425eae3122032db7e9acf449451be"}, + {file = "tomli-2.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7d86942e56ded512a594786a5ba0a5e521d02529b3826e7761a05138341a2ac"}, + {file = "tomli-2.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:73ee0b47d4dad1c5e996e3cd33b8a76a50167ae5f96a2607cbe8cc773506ab22"}, + {file = "tomli-2.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:792262b94d5d0a466afb5bc63c7daa9d75520110971ee269152083270998316f"}, + {file = "tomli-2.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f195fe57ecceac95a66a75ac24d9d5fbc98ef0962e09b2eddec5d39375aae52"}, + {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e31d432427dcbf4d86958c184b9bfd1e96b5b71f8eb17e6d02531f434fd335b8"}, + {file = "tomli-2.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b0882799624980785240ab732537fcfc372601015c00f7fc367c55308c186f6"}, + {file = "tomli-2.3.0-cp312-cp312-win32.whl", hash = "sha256:ff72b71b5d10d22ecb084d345fc26f42b5143c5533db5e2eaba7d2d335358876"}, + {file = "tomli-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:1cb4ed918939151a03f33d4242ccd0aa5f11b3547d0cf30f7c74a408a5b99878"}, + {file = "tomli-2.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5192f562738228945d7b13d4930baffda67b69425a7f0da96d360b0a3888136b"}, + {file = "tomli-2.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:be71c93a63d738597996be9528f4abe628d1adf5e6eb11607bc8fe1a510b5dae"}, + {file = "tomli-2.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4665508bcbac83a31ff8ab08f424b665200c0e1e645d2bd9ab3d3e557b6185b"}, + {file = "tomli-2.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4021923f97266babc6ccab9f5068642a0095faa0a51a246a6a02fccbb3514eaf"}, + {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4ea38c40145a357d513bffad0ed869f13c1773716cf71ccaa83b0fa0cc4e42f"}, + {file = "tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05"}, + {file = "tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606"}, + {file = "tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999"}, + {file = "tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e"}, + {file = "tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3"}, + {file = "tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc"}, + {file = "tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0"}, + {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879"}, + {file = "tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005"}, + {file = "tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463"}, + {file = "tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8"}, + {file = "tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77"}, + {file = "tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf"}, + {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530"}, + {file = "tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b"}, + {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67"}, + {file = "tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f"}, + {file = "tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0"}, + {file = "tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba"}, + {file = "tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b"}, + {file = "tomli-2.3.0.tar.gz", hash = "sha256:64be704a875d2a59753d80ee8a533c3fe183e3f06807ff7dc2232938ccb01549"}, +] + +[[package]] +name = "tox" +version = "4.25.0" +description = "tox is a generic virtualenv management and test command line tool" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "tox-4.25.0-py3-none-any.whl", hash = "sha256:4dfdc7ba2cc6fdc6688dde1b21e7b46ff6c41795fb54586c91a3533317b5255c"}, + {file = "tox-4.25.0.tar.gz", hash = "sha256:dd67f030317b80722cf52b246ff42aafd3ed27ddf331c415612d084304cf5e52"}, +] + +[package.dependencies] +cachetools = ">=5.5.1" +chardet = ">=5.2" +colorama = ">=0.4.6" +filelock = ">=3.16.1" +packaging = ">=24.2" +platformdirs = ">=4.3.6" +pluggy = ">=1.5" +pyproject-api = ">=1.8" +tomli = {version = ">=2.2.1", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.12.2", markers = "python_version < \"3.11\""} +virtualenv = ">=20.29.1" + +[package.extras] +test = ["devpi-process (>=1.0.2)", "pytest (>=8.3.4)", "pytest-mock (>=3.14)"] + +[[package]] +name = "typing-extensions" +version = "4.13.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" +files = [ + {file = "typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c"}, + {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"}, +] + +[[package]] +name = "virtualenv" +version = "20.35.4" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "virtualenv-20.35.4-py3-none-any.whl", hash = "sha256:c21c9cede36c9753eeade68ba7d523529f228a403463376cf821eaae2b650f1b"}, + {file = "virtualenv-20.35.4.tar.gz", hash = "sha256:643d3914d73d3eeb0c552cbb12d7e82adf0e504dbf86a3182f8771a153a1971c"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<5" +typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] + +[metadata] +lock-version = "2.1" +python-versions = "^3.8" +content-hash = "0ebc541580000b5fbae4f281d2343408fc192046d1947051613ece25cff9a619" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c02777c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[tool.poetry] +name = "cleanco" +version = "2.4-dev" +description = "Python library to process company names" +authors = ["Paul Solin "] +license = "MIT" +readme = "README.md" +homepage = "https://github.com/psolin/cleanco" +repository = "https://github.com/psolin/cleanco" +classifiers = [ + "Topic :: Office/Business", + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", +] +packages = [{ include = "cleanco" }] + +[tool.poetry.dependencies] +python = "^3.8" + +[tool.poetry.group.dev.dependencies] +pytest = "^7.0" +tox = "^4.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[[tool.poetry.source]] +name = "pypi" +priority = "primary" + +[tool.black] +line-length = 100 +target-version = ["py311"] + +[tool.isort] +profile = "black" +src_paths = ["abzu", "tests"] + +[tool.pytest.ini_options] +testpaths = ["tests"] + +[tool.tox] +legacy_tox_ini = """ +[tox] +envlist = py38, py39, py310, py311, py312, py313 +isolated_build = true + +[testenv] +deps = pytest +commands = pytest +""" diff --git a/setup.py b/setup.py deleted file mode 100755 index 92e1072..0000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python - -import setuptools - -from setuptools import setup - -with open('README.md', encoding='utf-8') as f: - long_description = f.read() - -setup(name='cleanco', - description='Python library to process company names', - long_description=long_description, - long_description_content_type='text/markdown', - version='2.4-dev', - license="MIT", - license_files=('LICENSE.txt',), - classifiers = [ - "Topic :: Office/Business", - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3" - ], - url='https://github.com/psolin/cleanco', - author='Paul Solin', - author_email='paul@paulsolin.com', - packages=["cleanco"], - setup_requires=['pytest-runner'], - tests_require=['pytest', 'tox'], -) diff --git a/tests/test_cleanname.py b/tests/test_cleanname.py index 1386973..753fed6 100644 --- a/tests/test_cleanname.py +++ b/tests/test_cleanname.py @@ -1,109 +1,591 @@ # encoding: utf-8 +""" +Comprehensive test suite for cleanco business name cleaning functionality. + +Test Categories: +1. Core Functionality Tests +2. Edge Case Tests +3. Unicode and Internationalization Tests +""" + import pytest from cleanco import basename +# ============================================================================ +# 1. CORE FUNCTIONALITY TESTS +# ============================================================================ + def test_deterministic_terms(monkeypatch): - """prepare_default_terms should always return the same list (even for different ordering in get_unique_terms)""" - from cleanco import clean - with monkeypatch.context() as m: - mock_terms = ["aaa", "bbb", "ccc"] - m.setattr(clean, "get_unique_terms", lambda: mock_terms) - res1 = clean.prepare_default_terms() - m.setattr(clean, "get_unique_terms", lambda: reversed(mock_terms)) - res2 = clean.prepare_default_terms() - assert res1 == res2 + """ + 1.1 Test deterministic term preparation + Ensures prepare_default_terms() returns consistent results regardless + of the ordering of terms returned by get_unique_terms(). + """ + from cleanco import clean + with monkeypatch.context() as m: + mock_terms = ["aaa", "bbb", "ccc"] + m.setattr(clean, "get_unique_terms", lambda: mock_terms) + res1 = clean.prepare_default_terms() + m.setattr(clean, "get_unique_terms", lambda: reversed(mock_terms)) + res2 = clean.prepare_default_terms() + assert res1 == res2 -# Tests that demonstrate stuff is stripped away basic_cleanup_tests = { - "name w/ suffix": "Hello World Oy", - "name w/ ', ltd.'": "Hello World, ltd.", - "name w/ ws suffix ws": "Hello World ltd", - "name w/ suffix ws": "Hello World ltd ", - "name w/ suffix dot ws": "Hello World ltd. ", - "name w/ ws suffix dot ws": " Hello World ltd. ", + "name w/ suffix": "Hello World Oy", + "name w/ ', ltd.'": "Hello World, ltd.", + "name w/ ws suffix ws": "Hello World ltd", + "name w/ suffix ws": "Hello World ltd ", + "name w/ suffix dot ws": "Hello World ltd. ", + "name w/ ws suffix dot ws": " Hello World ltd. ", } def test_basic_cleanups(): - expected = "Hello World" - errmsg = "cleanup of %s failed" - for testname, variation in basic_cleanup_tests.items(): - assert basename(variation) == expected, errmsg % testname + """ + 1.2 Test basic suffix removal + + Tests that common legal suffixes are properly identified and removed + with various whitespace and punctuation configurations. + + Sub-tests: + 1.2.1 - Simple suffix removal + 1.2.2 - Suffix with comma before + 1.2.3 - Multiple whitespace handling + 1.2.4 - Trailing whitespace handling + 1.2.5 - Suffix with period and trailing space + 1.2.6 - Leading and trailing whitespace + """ + expected = "Hello World" + errmsg = "cleanup of %s failed" + for testname, variation in basic_cleanup_tests.items(): + assert basename(variation) == expected, errmsg % testname + multi_cleanup_tests = { - "name + suffix": "Hello World Oy", - "name + suffix (without punct)": "Hello World sro", - "prefix + name": "Oy Hello World", - "prefix + name + suffix": "Oy Hello World Ab", - "name w/ term in middle": "Hello Oy World", - "name w/ complex term in middle": "Hello pty ltd World", - "name w/ mid + suffix": "Hello Oy World Ab" + "name + suffix": "Hello World Oy", + "name + suffix (without punct)": "Hello World sro", + "prefix + name": "Oy Hello World", + "prefix + name + suffix": "Oy Hello World Ab", + "name w/ term in middle": "Hello Oy World", + "name w/ complex term in middle": "Hello pty ltd World", + "name w/ mid + suffix": "Hello Oy World Ab" } def test_multi_type_cleanups(): - expected = "Hello World" - errmsg = "cleanup of %s failed" - for testname, variation in multi_cleanup_tests.items(): - result = basename(variation, prefix=True, suffix=True, middle=True) - assert result == expected, errmsg % testname + """ + 1.3 Test prefix, suffix, and middle term removal + + Tests removal of legal terms from various positions when + prefix=True, suffix=True, and middle=True are enabled. + Sub-tests: + 1.3.1 - Suffix-only removal + 1.3.2 - Suffix without punctuation + 1.3.3 - Prefix-only removal + 1.3.4 - Prefix and suffix removal + 1.3.5 - Middle term removal + 1.3.6 - Complex middle term removal (compound terms) + 1.3.7 - Middle and suffix removal + """ + expected = "Hello World" + errmsg = "cleanup of %s failed" + for testname, variation in multi_cleanup_tests.items(): + result = basename(variation, prefix=True, suffix=True, middle=True) + assert result == expected, errmsg % testname -# Tests that demonstrate basename can be run twice effectively double_cleanup_tests = { - "name + two prefix": "Ab Oy Hello World", - "name + two suffix": "Hello World Ab Oy", - "name + two in middle": "Hello Ab Oy World" + "name + two prefix": "Ab Oy Hello World", + "name + two suffix": "Hello World Ab Oy", + "name + two in middle": "Hello Ab Oy World" } def test_double_cleanups(): - expected = "Hello World" - errmsg = "cleanup of %s failed" - for testname, variation in multi_cleanup_tests.items(): - result = basename(variation, prefix=True, suffix=True, middle=True) - final = basename(result, prefix=True, suffix=True, middle=True) + """ + 1.4 Test iterative cleaning (running basename twice) - assert final == expected, errmsg % testname + Verifies that running basename() multiple times can remove + multiple legal terms that weren't caught in a single pass. + + Sub-tests: + 1.4.1 - Two prefixes requiring double pass + 1.4.2 - Two suffixes requiring double pass + 1.4.3 - Two middle terms requiring double pass + """ + expected = "Hello World" + errmsg = "cleanup of %s failed" + for testname, variation in multi_cleanup_tests.items(): + result = basename(variation, prefix=True, suffix=True, middle=True) + final = basename(result, prefix=True, suffix=True, middle=True) + assert final == expected, errmsg % testname -# Tests that demonstrate organization name is kept intact preserving_cleanup_tests = { - "name with comma": ("Hello, World, ltd.", "Hello, World"), - "name with dot": ("Hello. World, Oy", "Hello. World") + "name with comma": ("Hello, World, ltd.", "Hello, World"), + "name with dot": ("Hello. World, Oy", "Hello. World") } def test_preserving_cleanups(): - errmsg = "preserving cleanup of %s failed" - for testname, (variation, expected) in preserving_cleanup_tests.items(): - assert basename(variation) == expected, errmsg % testname + """ + 1.5 Test punctuation preservation in company names -# Test umlauts + Ensures that punctuation that is part of the actual company name + (commas, periods) is preserved while legal suffixes are removed. + Sub-tests: + 1.5.1 - Comma in company name preserved + 1.5.2 - Period in company name preserved + """ + errmsg = "preserving cleanup of %s failed" + for testname, (variation, expected) in preserving_cleanup_tests.items(): + assert basename(variation) == expected, errmsg % testname -unicode_umlaut_tests = { - "name with umlaut in end": ("Säätämö Oy", "Säätämö"), - "name with umlauts & comma": ("Säätämö, Oy", "Säätämö"), - "name with no ending umlaut": ("Säätämo Oy", "Säätämo"), - "name with beginning umlaut": ("Äätämo Oy", "Äätämo"), - "name with just umlauts": ("Äätämö", "Äätämö"), - "cyrillic name": ("ОАО Новороссийский морской торговый порт", "Новороссийский морской торговый порт") +unicode_umlaut_tests = { + "name with umlaut in end": ("Säätämö Oy", "Säätämö"), + "name with umlauts & comma": ("Säätämö, Oy", "Säätämö"), + "name with no ending umlaut": ("Säätämo Oy", "Säätämo"), + "name with beginning umlaut": ("Äätämo Oy", "Äätämo"), + "name with just umlauts": ("Äätämö", "Äätämö"), + "cyrillic name": ("ОАО Новороссийский морской торговый порт", "Новороссийский морской торговый порт") } def test_with_unicode_umlauted_name(): - errmsg = "preserving cleanup of %s failed" - for testname, (variation, expected) in unicode_umlaut_tests.items(): - assert basename(variation, prefix=True) == expected, errmsg % testname + """ + 1.6 Test Finnish/Nordic umlaut handling + + Tests that company names with Nordic umlauts (ä, ö) and Cyrillic + characters are handled correctly with prefix removal. + + Sub-tests: + 1.6.1 - Umlaut at end of name + 1.6.2 - Umlauts with comma punctuation + 1.6.3 - Umlaut in middle of name + 1.6.4 - Umlaut at beginning of name + 1.6.5 - Name consisting only of umlauts + 1.6.6 - Cyrillic script with prefix removal + """ + errmsg = "preserving cleanup of %s failed" + for testname, (variation, expected) in unicode_umlaut_tests.items(): + assert basename(variation, prefix=True) == expected, errmsg % testname terms_with_accents_tests = { - "term with ł correct spelling": ("Łoś spółka z o.o", "Łoś"), - "term with ł incorrect spelling": ("Łoś spolka z o.o", "Łoś"), + "term with ł correct spelling": ("Łoś spółka z o.o", "Łoś"), + "term with ł incorrect spelling": ("Łoś spolka z o.o", "Łoś"), } def test_terms_with_accents(): - errmsg = "preserving cleanup of %s failed" - for testname, (variation, expected) in terms_with_accents_tests.items(): - assert basename(variation, suffix=True) == expected, errmsg % testname + """ + 1.7 Test Polish accented term matching + + Verifies that Polish legal terms with special characters (ł, ó) + are correctly matched and removed, with normalization handling + different spellings. + + Sub-tests: + 1.7.1 - Correctly spelled Polish term (spółka) + 1.7.2 - Normalized spelling without accents (spolka) + """ + errmsg = "preserving cleanup of %s failed" + for testname, (variation, expected) in terms_with_accents_tests.items(): + assert basename(variation, suffix=True) == expected, errmsg % testname + + +# ============================================================================ +# 2. EDGE CASE TESTS +# ============================================================================ + +empty_result_single_tests = { + "AMBA (Danish term)": "AMBA", + "Inc": "Inc", + "LLC": "LLC", + "Ltd": "Ltd", + "Corporation": "Corporation", + "Limited": "Limited", +} + +def test_empty_result_single_terms(): + """ + 2.1 Test empty results from single legal terms + + Documents current behavior: when input consists only of a single + legal term, the result is an empty string. + + Sub-tests: + 2.1.1 - AMBA (Danish term) + 2.1.2 - Inc + 2.1.3 - LLC + 2.1.4 - Ltd + 2.1.5 - Corporation + 2.1.6 - Limited + """ + expected = '' + errmsg = "empty result test for %s failed" + for testname, variation in empty_result_single_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +empty_result_multiple_tests = { + "inc & co": "inc & co", + "Ltd. & Co.": "Ltd. & Co.", +} + +def test_empty_result_multiple_terms(): + """ + 2.2 Test empty results from multiple legal terms + + When input consists only of multiple legal terms, result is empty. + This edge case is related to test 2.1. + + Sub-tests: + 2.2.1 - "inc & co" returns empty + 2.2.2 - "Ltd. & Co." returns empty + """ + expected = '' + errmsg = "empty result test for %s failed" + for testname, variation in empty_result_multiple_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +punctuation_handling_tests = { + "ampersand (&) preservation": ("Smith & Jones Corporation", "Smith & Jones"), + "hyphen (-) preservation": ("Smith-Jones Ltd", "Smith-Jones"), + "apostrophe (') preservation": ("McDonald's Corporation", "McDonald's"), + "multiple periods (abbreviations)": ("A.B.C. Corp", "A.B.C."), + "trailing exclamation removal": ("Yahoo! Inc", "Yahoo"), +} + +def test_punctuation_handling(): + """ + 2.3 Test various punctuation scenarios + + Tests how different punctuation marks are handled in company names. + + Sub-tests: + 2.3.1 - Ampersand (&) preservation + 2.3.2 - Hyphen (-) preservation + 2.3.3 - Apostrophe (') preservation + 2.3.4 - Multiple periods (abbreviations) + 2.3.5 - Trailing exclamation marks removal + + Note: Test 2.3.5 documents current behavior where trailing exclamation + marks are removed (e.g., "Yahoo! Inc" becomes "Yahoo"). This is inconsistent + with the preservation of other punctuation marks like apostrophes and + ampersands, and may be addressed in a future update. + """ + errmsg = "punctuation handling test for %s failed" + for testname, (variation, expected) in punctuation_handling_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +case_insensitivity_tests = { + "uppercase term (LLC)": ("Company Name LLC", "Company Name"), + "lowercase term (llc)": ("Company Name llc", "Company Name"), + "mixed case term (LlC)": ("Company Name LlC", "Company Name"), +} + +def test_case_insensitivity(): + """ + 2.4 Test case-insensitive term matching + + Verifies that legal terms are matched regardless of case (uppercase, + lowercase, or mixed case). + + Sub-tests: + 2.4.1 - Uppercase term (LLC) + 2.4.2 - Lowercase term (llc) + 2.4.3 - Mixed case term (LlC) + """ + errmsg = "case insensitivity test for %s failed" + for testname, (variation, expected) in case_insensitivity_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +compound_terms_tests = { + "Pty Ltd compound": ("Example Company Pty Ltd", "Example"), + "Pty Limited (known issue)": ("Example Example Pty Limited", "Example Example Pty"), + "entire name is legal terms": ("Company Pvt. Ltd.", ""), +} + +def test_compound_terms(): + """ + 2.5 Test compound legal terms + + Tests handling of multi-word legal designations and documents + known issues. + + Sub-tests: + 2.5.1 - "Pty Ltd" compound (note: 'Company' also removed as it's a term) + 2.5.2 - "Pty Limited" known bug - doesn't work correctly + 2.5.3 - "Pvt. Ltd." where entire name is legal terms + """ + errmsg = "compound terms test for %s failed" + for testname, (variation, expected) in compound_terms_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +real_world_companies_tests = { + "Apple Inc.": ("Apple Inc.", "Apple"), + "Microsoft Corporation": ("Microsoft Corporation", "Microsoft"), + "Berkshire Hathaway Inc.": ("Berkshire Hathaway Inc.", "Berkshire Hathaway"), + "Procter & Gamble Co.": ("Procter & Gamble Co.", "Procter & Gamble"), + "AT&T Inc.": ("AT&T Inc.", "AT&T"), +} + +def test_real_world_companies(): + """ + 2.6 Test real-world company names + + Tests with actual well-known company names to ensure practical + applicability. + + Sub-tests: + 2.6.1 - Apple Inc. + 2.6.2 - Microsoft Corporation + 2.6.3 - Berkshire Hathaway Inc. + 2.6.4 - Procter & Gamble Co. + 2.6.5 - AT&T Inc. + """ + errmsg = "real world company test for %s failed" + for testname, (variation, expected) in real_world_companies_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +numbers_in_names_tests = { + "numbers after name": ("Company 123 Ltd", "Company 123"), + "name only numbers": ("123456 Inc", "123456"), + "mixed alphanumeric": ("ABC123 Corporation", "ABC123"), +} + +def test_numbers_in_names(): + """ + 2.7 Test company names containing numbers + + Verifies that numeric characters in company names are preserved. + + Sub-tests: + 2.7.1 - Numbers after name + 2.7.2 - Name consisting only of numbers + 2.7.3 - Mixed alphanumeric name + """ + errmsg = "numbers in names test for %s failed" + for testname, (variation, expected) in numbers_in_names_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +special_formats_tests = { + "all uppercase": ("IBM CORPORATION", "IBM"), + "periods as abbreviations": ("U.S. Steel Corp.", "U.S. Steel"), + "single letter name": ("X Corporation", "X"), +} + +def test_special_formats(): + """ + 2.8 Test special formatting scenarios + + Tests edge cases with unusual formatting patterns. + + Sub-tests: + 2.8.1 - All uppercase company name + 2.8.2 - Periods used as abbreviations + 2.8.3 - Single letter company name + """ + errmsg = "special formats test for %s failed" + for testname, (variation, expected) in special_formats_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +whitespace_handling_tests = { + "leading whitespace": (" Hello World Ltd", "Hello World"), + "trailing whitespace": ("Hello World Ltd ", "Hello World"), + "whitespace around term only": (" LLC ", ""), +} + +def test_whitespace_handling(): + """ + 2.9 Test whitespace handling + + Verifies that leading and trailing whitespace is properly handled. + + Sub-tests: + 2.9.1 - Leading whitespace + 2.9.2 - Trailing whitespace + 2.9.3 - Whitespace around legal term only + """ + errmsg = "whitespace handling test for %s failed" + for testname, (variation, expected) in whitespace_handling_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +ambiguous_term_tests = { + "Limited in name and suffix": ("Limited Edition Products Ltd", "Limited Edition Products"), +} + +def test_ambiguous_term_in_name(): + """ + 2.10 Test ambiguous cases where legal term appears in actual name + + When a legal term like "Limited" appears as part of the actual + company name, only suffix occurrences should be removed. + + Example: "Limited Edition Products Ltd" should become + "Limited Edition Products" not "Edition Products" + """ + errmsg = "ambiguous term test for %s failed" + for testname, (variation, expected) in ambiguous_term_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +# ============================================================================ +# 3. UNICODE AND INTERNATIONALIZATION TESTS +# ============================================================================ + +unicode_non_latin_scripts_tests = { + "Arabic script": ('شركة المثال المحدودة', 'شركة المثال المحدودة'), + "Hebrew script": ('חברה בע״מ', 'חברה בע״מ'), + "Japanese (Kanji + Hiragana)": ('株式会社サンプル', '株式会社サンプル'), + "Korean (Hangul)": ('삼성전자 주식회사', '삼성전자 주식회사'), + "Thai script": ('บริษัท จำกัด', 'บริษัท จำกัด'), + "Greek alphabet": ('Ελληνική Επιχείρηση', 'Ελληνική Επιχείρηση'), + "Chinese with English suffix": ('北京公司 Ltd', '北京公司'), + "Cyrillic with English suffix": ('Москва Corporation', 'Москва'), + "Japanese with English suffix": ('東京株式会社 Inc.', '東京株式会社'), +} + +def test_unicode_non_latin_scripts(): + """ + 3.1 Test various Unicode and non-Latin scripts + + Tests that the library handles non-Latin scripts correctly, + preserving them when they don't match legal terms. + + Sub-tests: + 3.1.1 - Arabic script + 3.1.2 - Hebrew script + 3.1.3 - Japanese (Kanji + Hiragana) + 3.1.4 - Korean (Hangul) + 3.1.5 - Thai script + 3.1.6 - Greek alphabet + 3.1.7 - Chinese with English suffix + 3.1.8 - Cyrillic with English suffix + 3.1.9 - Japanese with English suffix + """ + errmsg = "unicode non-Latin script test for %s failed" + for testname, (variation, expected) in unicode_non_latin_scripts_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +unicode_special_characters_tests = { + "French accents": ('Société Française Ltd', 'Société Française'), + "Spanish accents": ('Compañía Española S.A.', 'Compañía Española'), + "Portuguese abbreviation": ('Empresa Ltda.', 'Empresa'), + "German umlauts (Müller & Söhne)": ('Müller & Söhne GmbH', 'Müller & Söhne'), + "German umlauts in name": ('Schöne Bücher Ltd', 'Schöne Bücher'), + "Nordic Ø character": ('Ørsted A/S', 'Ørsted'), + "Nordic Å character": ('Åland Corporation', 'Åland'), + "Czech characters": ('Český Krumlov s.r.o.', 'Český Krumlov'), + "Polish Ł character": ('Łódź Spółka', 'Łódź Spółka'), +} + +def test_unicode_special_characters(): + """ + 3.2 Test Unicode special characters and accented Latin scripts + + Tests handling of Latin script with diacritical marks and special + characters from various European languages. + + Sub-tests: + 3.2.1 - French accents (Société) + 3.2.2 - Spanish accents (Compañía) + 3.2.3 - Portuguese abbreviation (Ltda.) + 3.2.4 - German umlauts (Müller, Söhne) + 3.2.5 - German umlauts in name (Schöne Bücher) + 3.2.6 - Nordic Ø character + 3.2.7 - Nordic Å character + 3.2.8 - Czech characters (Český) + 3.2.9 - Polish Ł character + """ + errmsg = "unicode special characters test for %s failed" + for testname, (variation, expected) in unicode_special_characters_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +unicode_mixed_content_tests = { + "mathematical symbol": ('Alpha β Gamma Corp', 'Alpha β Gamma'), + "currency symbol": ('€uro Company Ltd', '€uro Company'), + "combining characters": ('Naïve Café Inc.', 'Naïve Café'), +} + +def test_unicode_mixed_content(): + """ + 3.3 Test mixed Unicode content with special symbols + + Tests handling of Unicode content that includes emojis, mathematical + symbols, currency symbols, and combining characters. + + Sub-tests: + 3.3.1 - Emoji in company name + 3.3.2 - Greek letter (mathematical symbol) + 3.3.3 - Currency symbol (Euro) + 3.3.4 - Combining diacritical marks (ï, é) + """ + # Emoji test handled separately due to different assertion style + result = basename('Tech 🚀 Innovation Ltd') + assert 'Tech' in result and 'Innovation' in result, "emoji test failed" + + errmsg = "unicode mixed content test for %s failed" + for testname, (variation, expected) in unicode_mixed_content_tests.items(): + assert basename(variation) == expected, errmsg % testname + + +unicode_right_to_left_tests = { + "Arabic with English suffix": ('الشركة العربية Ltd', 'الشركة العربية'), + "Hebrew with English suffix": ('החברה העברית Inc.', 'החברה העברית'), +} + +def test_unicode_right_to_left(): + """ + 3.4 Test right-to-left (RTL) script handling + + Tests that RTL scripts (Arabic, Hebrew) are correctly preserved + when combined with LTR English legal suffixes. + + Sub-tests: + 3.4.1 - Arabic with English suffix + 3.4.2 - Hebrew with English suffix + """ + errmsg = "unicode RTL script test for %s failed" + for testname, (variation, expected) in unicode_right_to_left_tests.items(): + result = basename(variation) + assert expected in result, errmsg % testname + + +unicode_normalization_tests = { + "composed form (single char)": ('Café Ltd', 'Café'), + "decomposed form (base + combining)": ('Café Ltd', 'Café'), +} + +def test_unicode_normalization(): + """ + 3.5 Test Unicode normalization handling + + Tests that different Unicode representations of the same visual + character (composed vs decomposed forms) are handled consistently. + + For example, é can be represented as: + - Single character U+00E9 (composed form) + - Two characters U+0065 + U+0301 (e + combining acute accent) + + Both should be normalized and treated equivalently. + + Sub-tests: + 3.5.1 - Composed form (single character) + 3.5.2 - Decomposed form (base + combining character) + """ + errmsg = "unicode normalization test for %s failed" + for testname, (variation, expected_contains) in unicode_normalization_tests.items(): + result = basename(variation) + assert 'Caf' in result, errmsg % testname diff --git a/tox.ini b/tox.ini index 4b54584..9c5e1ae 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,7 @@ [tox] envlist = py38, py39, py310, py311, py312 +isolated_build = true [testenv] -deps=pytest -commands=py.test +deps = pytest +commands = pytest