Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ ROR staff should replace values in [] with valid credential values. External use

3. Index the latest ROR dataset from https://github.com/ror-community/ror-data

docker-compose exec web python manage.py setup v1.0-2022-03-17-ror-data -s 1
docker-compose exec web python manage.py setup v1.0-2022-03-17-ror-data -s 2

*Note: You must specify a dataset that exists in [ror-data](https://github.com/ror-community/ror-data)*

4. <http://localhost:9292/organizations>.
4. Visit <http://localhost:9292/v2/organizations>.

5. Optionally, start other services, such as [ror-app](https://github.com/ror-community/ror-app) (the search UI) or [generate-id](https://github.com/ror-community/generate-id) (middleware microservice)

Expand All @@ -64,9 +64,9 @@ Used in the data deployment process managed in [ror-records](https://github.com/

docker-compose up -d

3. Index the latest v1 ROR dataset from https://github.com/ror-community/ror-data . To index a v2 dataset, see [Indexing v2 data below](#indexing-v2-data)
3. Index the latest ROR dataset from https://github.com/ror-community/ror-data (see [Indexing v2 data](#indexing-v2-data) below):

docker-compose exec web python manage.py setup v1.0-2022-03-17-ror-data -s 1
docker-compose exec web python manage.py setup v1.0-2022-03-17-ror-data -s 2

*Note: You must specify a dataset that exists in [ror-data](https://github.com/ror-community/ror-data)*

Expand All @@ -92,19 +92,17 @@ To delete the existing index, create a new index and index a data dump:

**LOCALHOST:** Run

docker-compose exec web python manage.py setup v1.0-2022-03-17-ror-data -s 1
docker-compose exec web python manage.py setup v1.0-2022-03-17-ror-data -s 2

**DEV/STAGING/PROD:** Access the running ror-api container and run:

python manage.py setup v1.0-2022-03-17-ror-data -s 1
python manage.py setup v1.0-2022-03-17-ror-data -s 2

*Note: You must specify a dataset that exists in [ror-data](https://github.com/ror-community/ror-data)*

#### Indexing v2 data

The `-s` argument specifies which schema version to index. To index a v2 data dump, use `-s 2`. To index both v1 and v2 at the same time, omit the `-s` option.

Note that a v2 formatted JSON file must exist in the zip file for the specified data dump version. Currently, v2 files only exist in [ror-community/ror-data-test](https://github.com/ror-community/ror-data-test). To index a data dump from ror-data-test rather than ror-data, add the `-t` option to the setup command, ex:
The API uses the v2 schema only, so use `-s 2` when indexing a data dump. Note that a v2-formatted JSON file must exist in the zip file for the specified data dump version. Currently, v2 files only exist in [ror-community/ror-data-test](https://github.com/ror-community/ror-data-test). To index a data dump from ror-data-test rather than ror-data, add the `-t` option to the setup command, for example:

python manage.py setup v1.32-2023-09-14-ror-data -s 2 -t

Expand Down
2 changes: 1 addition & 1 deletion rorapi/common/create_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def new_record_from_json(json_input, version):
if not error:
new_record['locations'] = updated_locations
new_record = add_created_last_mod(new_record)
new_ror_id = check_ror_id(version)
new_ror_id = check_ror_id()
print("new ror id: " + new_ror_id)
new_record['id'] = new_ror_id
error, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA)
Expand Down
2 changes: 1 addition & 1 deletion rorapi/common/csv_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def update_record_from_csv(csv_data, version):
errors = []
updated_record = None
print("updating record from csv")
existing_org_errors, existing_org = retrieve_organization(csv_data['id'], version)
existing_org_errors, existing_org = retrieve_organization(csv_data['id'])
print(existing_org)
if existing_org is None:
errors.append("No existing record found for ROR ID '{}'".format(csv_data['id']))
Expand Down
7 changes: 2 additions & 5 deletions rorapi/common/es_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,8 @@
class ESQueryBuilder:
"""Elasticsearch query builder class"""

def __init__(self, version):
if version == "v2":
self.search = Search(using=ES7, index=ES_VARS["INDEX_V2"])
else:
self.search = Search(using=ES7, index=ES_VARS["INDEX_V1"])
def __init__(self):
self.search = Search(using=ES7, index=ES_VARS["INDEX_V2"])
self.search = self.search.extra(track_total_hits=True)
self.search = self.search.params(search_type="dfs_query_then_fetch")

Expand Down
78 changes: 29 additions & 49 deletions rorapi/common/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from rorapi.common.models import Errors
from rorapi.common.es_utils import ESQueryBuilder
from rorapi.v1.models import MatchingResult as MatchingResultV1
from rorapi.v2.models import MatchingResult as MatchingResultV2

from collections import namedtuple
Expand Down Expand Up @@ -200,25 +199,16 @@ def get_similarity(aff_sub, cand_name):
return comparfun(aff_sub, cand_name) / 100


def get_score(candidate, aff_sub, countries, version):
def get_score(candidate, aff_sub, countries):
"""Calculate the similarity between the affiliation substring
and the candidate, using all name versions."""
if version == "v2":
country_code = candidate.locations[0].geonames_details.country_code
all_names = [
name["value"] for name in candidate.names if "acronym" not in name["types"]
]
acronyms = [
name["value"] for name in candidate.names if "acronym" in name["types"]
]
else:
country_code = candidate.country.country_code
all_names = (
[candidate.name]
+ [l.label for l in candidate.labels]
+ list(candidate.aliases)
)
acronyms = candidate.acronyms
country_code = candidate.locations[0].geonames_details.country_code
all_names = [
name["value"] for name in candidate.names if "acronym" not in name["types"]
]
acronyms = [
name["value"] for name in candidate.names if "acronym" in name["types"]
]

if countries and to_region(country_code) not in countries:
return 0
Expand All @@ -239,11 +229,11 @@ def get_score(candidate, aff_sub, countries, version):
MatchedOrganization.__new__.__defaults__ = (False, None, None, 0, None)


def match_by_query(text, matching_type, query, countries, version):
def match_by_query(text, matching_type, query, countries):
"""Match affiliation text using specific ES query."""
candidates = query.execute()
scores = [
(candidate, get_score(candidate, text, countries, version))
(candidate, get_score(candidate, text, countries))
for candidate in candidates
]
if not candidates:
Expand All @@ -262,11 +252,10 @@ def match_by_query(text, matching_type, query, countries, version):
return chosen, all_matched


def match_by_type(text, matching_type, countries, version):
def match_by_type(text, matching_type, countries):
"""Match affiliation text using specific matching mode/type."""

fields_v1 = ["name.norm", "aliases.norm", "labels.label.norm"]
fields_v2 = ["names.value.norm"]
fields = ["names.value.norm"]
substrings = []
if matching_type == MATCHING_TYPE_HEURISTICS:
h1 = re.search(r"University of ([^\s]+)", text)
Expand All @@ -289,12 +278,7 @@ def match_by_type(text, matching_type, countries, version):
else:
substrings.append(text)

queries = [ESQueryBuilder(version) for _ in substrings]

if version == "v2":
fields = fields_v2
else:
fields = fields_v1
queries = [ESQueryBuilder() for _ in substrings]

for s, q in zip(substrings, queries):
if matching_type == MATCHING_TYPE_PHRASE:
Expand All @@ -309,7 +293,7 @@ def match_by_type(text, matching_type, countries, version):
q.add_common_query(fields, normalize(text))
queries = [q.get_query() for q in queries]
matched = [
match_by_query(t, matching_type, q, countries, version)
match_by_query(t, matching_type, q, countries)
for t, q in zip(substrings, queries)
]
if not matched:
Expand All @@ -327,16 +311,15 @@ class MatchingNode:
"""Matching node class. Represents a substring of the original affiliation
that potentially could be matched to an organization."""

def __init__(self, text, version):
def __init__(self, text):
self.text = text
self.version = version
self.matched = None
self.all_matched = []

def match(self, countries, min_score):
for matching_type in NODE_MATCHING_TYPES:
chosen, all_matched = match_by_type(
self.text, matching_type, countries, self.version
self.text, matching_type, countries
)
self.all_matched.extend(all_matched)
if self.matched is None:
Expand Down Expand Up @@ -388,20 +371,19 @@ class MatchingGraph:
This prevents matching an organization to a substring and another
organization to the substring's substring."""

def __init__(self, affiliation, version):
def __init__(self, affiliation):
self.nodes = []
self.version = version
self.affiliation = affiliation
affiliation = re.sub("&amp;", "&", affiliation)
affiliation_cleaned = clean_search_string(affiliation)
n = MatchingNode(affiliation_cleaned, self.version)
n = MatchingNode(affiliation_cleaned)
self.nodes.append(n)
for part in [s.strip() for s in re.split("[,;:]", affiliation)]:
part_cleaned = clean_search_string(part)
do_not_match = check_do_not_match(part_cleaned)
# do not perform search if substring exactly matches a country name or ISO code
if do_not_match == False:
n = MatchingNode(part_cleaned, self.version)
n = MatchingNode(part_cleaned)
self.nodes.append(n)

def remove_low_scores(self, min_score):
Expand All @@ -422,7 +404,7 @@ def match(self, countries, min_score):
]:
chosen.append(node.matched)
acr_chosen, acr_all_matched = match_by_type(
self.affiliation, MATCHING_TYPE_ACRONYM, countries, self.version
self.affiliation, MATCHING_TYPE_ACRONYM, countries
)
all_matched.extend(acr_all_matched)
return chosen, all_matched
Expand Down Expand Up @@ -492,33 +474,31 @@ def get_output(chosen, all_matched, active_only):
return sorted(output, key=lambda x: x.score, reverse=True)[:100]


def check_exact_match(affiliation, countries, version):
qb = ESQueryBuilder(version)
def check_exact_match(affiliation, countries):
qb = ESQueryBuilder()
qb.add_string_query('"' + affiliation + '"')
return match_by_query(
affiliation, MATCHING_TYPE_EXACT, qb.get_query(), countries, version
affiliation, MATCHING_TYPE_EXACT, qb.get_query(), countries
)


def match_affiliation(affiliation, active_only, version):
def match_affiliation(affiliation, active_only):
countries = get_countries(affiliation)
exact_chosen, exact_all_matched = check_exact_match(affiliation, countries, version)
exact_chosen, exact_all_matched = check_exact_match(affiliation, countries)
if exact_chosen.score == 1.0:
return get_output(exact_chosen, exact_all_matched, active_only)
else:
graph = MatchingGraph(affiliation, version)
graph = MatchingGraph(affiliation)
chosen, all_matched = graph.match(countries, MIN_CHOSEN_SCORE)
return get_output(chosen, all_matched, active_only)


def match_organizations(params, version):
def match_organizations(params):
if "affiliation" in params:
active_only = True
if "all_status" in params:
if params["all_status"] == "" or params["all_status"].lower() == "true":
active_only = False
matched = match_affiliation(params.get("affiliation"), active_only, version)
if version == "v2":
return None, MatchingResultV2(matched)
return None, MatchingResultV1(matched)
matched = match_affiliation(params.get("affiliation"), active_only)
return None, MatchingResultV2(matched)
return Errors('"affiliation" parameter missing'), None
18 changes: 7 additions & 11 deletions rorapi/common/matching_single_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from rorapi.common.models import Errors
from rorapi.settings import ES7
from rorapi.common.es_utils import ESQueryBuilder
from rorapi.v1.models import MatchingResult as MatchingResultV1
from rorapi.v2.models import MatchingResult as MatchingResultV2

from collections import namedtuple
Expand Down Expand Up @@ -296,23 +295,20 @@ def get_output(chosen, all_matched):
return all_matched


def get_candidates(aff, countries, version):
qb = ESQueryBuilder(version)
def get_candidates(aff, countries):
qb = ESQueryBuilder()
qb.add_affiliation_query(aff, 200)
return match_by_query(aff, qb.get_query(), countries)


def match_affiliation(affiliation, version):
def match_affiliation(affiliation):
countries = get_countries(affiliation)
chosen, all_matched = get_candidates(affiliation, countries, version)
chosen, all_matched = get_candidates(affiliation, countries)
return get_output(chosen, all_matched)


def match_organizations(params, version):
def match_organizations(params):
if "affiliation" in params:
matched = match_affiliation(params.get("affiliation"), version)

if version == "v2":
return None, MatchingResultV2(matched)
return None, MatchingResultV1(matched)
matched = match_affiliation(params.get("affiliation"))
return None, MatchingResultV2(matched)
return Errors(["'affiliation' parameter missing"]), None
Loading
Loading