From 31581c311def9975dd6b4af258b94143e7696f75 Mon Sep 17 00:00:00 2001 From: Brandon Date: Fri, 4 Jun 2021 13:31:48 -0400 Subject: [PATCH 1/4] Isolated parser service. Made other function schemas compatible Isolated parser service. Made other function schemas in other services compatible with parser function schema --- .../functionchurn/schemas/parser.py | 5 +- metrics/loc/loc/schemas/parser.py | 5 +- parser/parser/constants.py | 69 ++ parser/parser/models.py | 1 - parser/parser/parsers/__init__.py | 2 +- parser/parser/parsers/srcmlparser.py | 917 +++++++++++++++++- parser/parser/schemas.py | 56 +- parser/parser/service.py | 29 + 8 files changed, 1069 insertions(+), 15 deletions(-) create mode 100644 parser/parser/constants.py diff --git a/metrics/functionchurn/functionchurn/schemas/parser.py b/metrics/functionchurn/functionchurn/schemas/parser.py index cea3935..547a701 100644 --- a/metrics/functionchurn/functionchurn/schemas/parser.py +++ b/metrics/functionchurn/functionchurn/schemas/parser.py @@ -1,4 +1,4 @@ -from marshmallow import Schema, fields, post_load +from marshmallow import Schema, fields, post_load, EXCLUDE from ..models import Comment, Function, Position, Span @@ -34,6 +34,9 @@ class FunctionSchema(Schema): signature = fields.String() span = fields.Nested(SpanSchema) + class Meta: + unknown = EXCLUDE + @post_load def make_function(self, data, **kwargs): return Function(**data) diff --git a/metrics/loc/loc/schemas/parser.py b/metrics/loc/loc/schemas/parser.py index cea3935..547a701 100644 --- a/metrics/loc/loc/schemas/parser.py +++ b/metrics/loc/loc/schemas/parser.py @@ -1,4 +1,4 @@ -from marshmallow import Schema, fields, post_load +from marshmallow import Schema, fields, post_load, EXCLUDE from ..models import Comment, Function, Position, Span @@ -34,6 +34,9 @@ class FunctionSchema(Schema): signature = fields.String() span = fields.Nested(SpanSchema) + class Meta: + unknown = EXCLUDE + @post_load def make_function(self, data, **kwargs): return 
Function(**data) diff --git a/parser/parser/constants.py b/parser/parser/constants.py new file mode 100644 index 0000000..042b3cf --- /dev/null +++ b/parser/parser/constants.py @@ -0,0 +1,69 @@ +C_LIB_FUNCTIONS = [ + "abort","abs","acos","asctime","asctime_r", + "asin","assert","atan","atan2","atexit", + "atof","atoi","atol","bsearch","btowc", + "calloc","catclose","catgets","catopen","ceil", + "clearerr","clock","cos","cosh","ctime", + "ctime64","ctime_r","ctime64_r","difftime","difftime64", + "div","erf","erfc","exit","exp", + "fabs","fclose","fdopen","feof","ferror", + "fflush","fgetc","fgetpos","fgets","fgetwc", + "fgetws","fileno","floor","fmod","fopen", + "fprintf","fputc","fputs","fputwc","fputws", + "fread","free","freopen","frexp","fscanf", + "fseek","fsetpos","ftell","fwide","fwprintf", + "fwrite","fwscanf","gamma","getc","getchar", + "getenv","gets","getwc","getwchar","gmtime", + "gmtime64","gmtime_r","gmtime64_r","hypot","isalnum", + "isalpha","isascii","isblank","iscntrl","isdigit", + "isgraph","islower","isprint","ispunct","isspace", + "isupper","iswalnum","iswalpha","iswblank","iswcntrl", + "iswctype","iswdigit","iswgraph","iswlower","iswprint", + "iswpunct","iswspace","iswupper","iswxdigit","isxdigit", + "j0","j1","jn","labs","ldexp", + "ldiv","localeconv","localtime","localtime64","localtime_r", + "localtime64_r","log","log10","longjmp","malloc", + "mblen","mbrlen","mbrtowc","mbsinit","mbsrtowcs", + "mbstowcs","mbtowc","memchr","memcmp","memcpy", + "memmove","memset","mktime","mktime64","modf", + "nextafter","nextafterl","nexttoward","nexttowardl","nl_langinfo", + "perror","pow","printf","putc","putchar", + "putenv","puts","putwc","putwchar","qsort", + "quantexpd32","quantexpd64","quantexpd128","quantized32","quantized64", + "quantized128","samequantumd32","samequantumd64","samequantumd128","raise", + "rand","rand_r","realloc","regcomp","regerror","regexec", + "regfree","remove","rename","rewind","scanf", + 
"setbuf","setjmp","setlocale","setvbuf","signal", + "sin","sinh","snprintf","sprintf","sqrt", + "srand","sscanf","strcasecmp","strcat","strchr", + "strcmp","strcoll","strcpy","strcspn","strerror", + "strfmon","strftime","strlen","strncasecmp","strncat", + "strncmp","strncpy","strpbrk","strptime","strrchr", + "strspn","strstr","strtod","strtod32","strtod64", + "strtod128","strtof","strtok","strtok_r","strtol", + "strtold","strtoul","strxfrm","swprintf","swscanf", + "system","tan","tanh","time","time64", + "tmpfile","tmpnam","toascii","tolower","toupper", + "towctrans","towlower","towupper","ungetc","ungetwc", + "va_arg","va_copy","va_end","va_start", + "vfprintf","vfscanf","vfwprintf","vfwscanf", + "vprintf","vscanf","vsprintf","vsnprintf", + "vsscanf","vswprintf","vswscanf","vwprintf","vwscanf", + "wcrtomb","wcscat","wcschr","wcscmp","wcscoll", + "wcscpy","wcscspn","wcsftime","wcslen","wcslocaleconv", + "wcsncat","wcsncmp","wcsncpy","wcspbrk","wcsptime", + "wcsrchr","char *ctime64_r(const time64_t *time, char *buf);","wcsspn","wcsstr","wcstod", + "wcstod32","wcstod64","wcstod128","wcstof","wcstok", + "wcstol","wcstold","wcstombs","wcstoul","wcsxfrm", + "wctob","wctomb","wctrans","wctype","wcwidth", + "wmemchr","wmemcmp","wmemcpy","wmemmove","wmemset", + "wprintf","wscanf","y0","y1","yn"] + +C_LIB_STREAMS = ["stderr", "stdout"] +C_RESERVED_KEYWORDS = [ + "auto", "const", "int", "short", "struct", + "unsigned", "double", "float", "break", "continue", + "long", "signed", "switch", "void", "else", "for", "case", + "default", "register", "sizeof", "typedef", "volatile", + "enum", "goto", "char", "do", "return", "static", + "union", "while", "extern", "if"] \ No newline at end of file diff --git a/parser/parser/models.py b/parser/parser/models.py index 897203c..86df356 100644 --- a/parser/parser/models.py +++ b/parser/parser/models.py @@ -1,6 +1,5 @@ import dataclasses - @dataclasses.dataclass(frozen=True) class Position: __slots__ = ['line', 'column'] diff --git 
a/parser/parser/parsers/__init__.py b/parser/parser/parsers/__init__.py index b83365e..2933f5e 100644 --- a/parser/parser/parsers/__init__.py +++ b/parser/parser/parsers/__init__.py @@ -1,7 +1,7 @@ from .srcmlparser import SrcMLParser from ..exceptions import NoParser -PARSERS = {'C': SrcMLParser, 'C++': SrcMLParser} +PARSERS = {'C': SrcMLParser, 'C++': SrcMLParser, 'Java': SrcMLParser} def get_parser(language): diff --git a/parser/parser/parsers/srcmlparser.py b/parser/parser/parsers/srcmlparser.py index c16ae2f..b153b83 100644 --- a/parser/parser/parsers/srcmlparser.py +++ b/parser/parser/parsers/srcmlparser.py @@ -6,6 +6,7 @@ from ..enumerations import CommentType from ..models import Comment, Function, Position, Span +from ..constants import C_LIB_FUNCTIONS, C_LIB_STREAMS, C_RESERVED_KEYWORDS logger = logging.getLogger(__name__) @@ -19,7 +20,6 @@ def _create_position(line, column): return Position(line=line, column=column) - def _get_comments(srcml): for comment in srcml.iter(f'{{{SRC_NS}}}comment'): begin, end = _get_span(comment) @@ -27,7 +27,6 @@ def _get_comments(srcml): begin, end = _create_position(*begin), _create_position(*end) yield Comment(type=type_, span=Span(begin=begin, end=end)) - def _get_declarations(srcml): for function in srcml.iter(f'{{{SRC_NS}}}function_decl'): signature = _get_signature(function) @@ -39,6 +38,29 @@ def _get_declarations(srcml): begin, end = _create_position(begin, None), _create_position(end, None) yield Function(signature=signature, span=Span(begin=begin, end=end)) +def _parse_enum(element): + enum_names = [] + + if re.search(rf"{{{SRC_NS}}}enum", element.tag): + + enum_block = element.find(rf"{{{SRC_NS}}}block") + enum_decls = enum_block.findall(rf"{{{SRC_NS}}}decl") if enum_block is not None else [] + + for decl in enum_decls: + + decl_name = _get_name_from_nested_name( + decl.find(rf"{{{SRC_NS}}}name") + ) + + decl_name_txt = ( + decl_name.text + if decl_name is not None and decl_name.text is not None + else '') + + if 
decl_name_txt != '': + enum_names.append(decl_name_txt) + + return list(set(enum_names)) def _get_definitions(srcml, nlines): for function in srcml.iter(f'{{{SRC_NS}}}function'): @@ -55,14 +77,12 @@ def _get_definitions(srcml, nlines): begin, end = _create_position(begin, None), _create_position(end, None) yield Function(signature=signature, span=Span(begin=begin, end=end)) - def _get_name(element): name = element.find('src:name', NS) if name is not None: name = ''.join(i.strip() for i in name.itertext()) return name - def _get_span(element): position = element.attrib[f'{{{POS_NS}}}start'] begin = (int(i) for i in position.split(':')) @@ -70,15 +90,17 @@ def _get_span(element): end = (int(i) for i in position.split(':')) return begin, end - def _get_signature(element): def _join(values, delimiter=' '): return delimiter.join(i.strip() for i in values if i.strip()) components = list() type_ = element.find('src:type', NS) - components.append(_join(type_.itertext())) - components.append(' ') + + if type_ is not None: + components.append(_join(type_.itertext())) + components.append(' ') + components.append(_get_name(element)) parameters = element.find('src:parameter_list', NS) if parameters: @@ -92,7 +114,6 @@ def _join(values, delimiter=' '): return ''.join(components) if components else None - def _get_srcml(contents, language): try: args = ['srcml', '--position', '--language', language, '-'] @@ -101,9 +122,850 @@ def _get_srcml(contents, language): ) return process.stdout except subprocess.CalledProcessError as error: - logger.exception(error) + print(error) + + return None + +def _get_name_from_nested_name(name): + if name is not None and name.text is not None: + return name + else: + next_elname = name.find(rf"{{{SRC_NS}}}name") + if next_elname is None: + return next_elname + + return _get_name_from_nested_name(next_elname) + +def _get_full_name_text_from_name(name): + name_txt = '' + + if name is not None: + if name.text is not None: + name_txt = name.text + 
else: + for n_txt in name.itertext(): + name_txt += n_txt + + return name_txt + +def _get_throws_expression_names(statement): + exception_names = [] + + if re.search(rf"{{{SRC_NS}}}throws", statement.tag): + args = statement.findall(rf"{{{SRC_NS}}}argument") + for arg in args: + expr = arg.find(rf"{{{SRC_NS}}}expr") + expr_name = expr.find(rf"{{{SRC_NS}}}name") if expr is not None else None + + name_txt = _get_full_name_text_from_name(expr_name) + + if name_txt != '': + exception_names.append(name_txt) + + return exception_names + +def _get_param_data(function): + parameter_list = function.find(rf"{{{SRC_NS}}}parameter_list") + parameters = parameter_list.findall(rf"{{{SRC_NS}}}parameter") + + parameter_declarations = [] + parameters_passed_by_reference = [] + + for param in parameters: + decl = param.find(rf"{{{SRC_NS}}}decl") + decl_name = decl.find(rf"{{{SRC_NS}}}name") if decl is not None else None + decl_name_txt = decl_name.text if decl_name is not None and decl_name.text else "" + + decl_type = decl.find(rf"{{{SRC_NS}}}type") if decl is not None else None + decl_type_name = decl_type.find(rf"{{{SRC_NS}}}name") if decl_type is not None else None + decl_type_name_txt = ( + decl_type_name.text + if decl_type_name is not None and decl_type_name.text + else "") + + decl_type_modifier = (decl_type.find(rf"{{{SRC_NS}}}modifier") + if decl_type is not None + else None) + + decl_type_modifier_txt = (decl_type_modifier.text + if decl_type_modifier is not None and decl_type_modifier.text + else "") + + if re.search(r"\*|\&", decl_type_modifier_txt): + parameters_passed_by_reference.append( + { + "type": decl_type_name_txt, + "modifier": decl_type_modifier_txt, + "name": decl_name_txt}) + + if decl_name_txt != "": + parameter_declarations.append( + { + "type": decl_type_name_txt, + "modifier": decl_type_modifier_txt, + "name": decl_name_txt}) + + return { + "parameters" : parameter_declarations, + "parameters_passed_by_reference": parameters_passed_by_reference} + +def 
_parse_function_call(element): + call_data = {} + + if re.search(rf"{{{SRC_NS}}}call", str(element)): + call_name = element.find(rf"{{{SRC_NS}}}name") + call_name_txt = _get_full_name_text_from_name(call_name) + + + if call_name_txt not in list(call_data) and call_name_txt != "": + call_data[call_name_txt] ={ + "cumulative_args": [] + } + + if call_name_txt != "": + call_arg_list = element.find(rf"{{{SRC_NS}}}argument_list") + + call_args = (call_arg_list.findall(rf"{{{SRC_NS}}}argument") + if call_arg_list is not None + else []) + + for arg in call_args: + arg_expr = arg.find(rf"{{{SRC_NS}}}expr") + arg_expr_name = (arg_expr.find(rf"{{{SRC_NS}}}name") + if arg_expr is not None + else None) + + arg_expr_name_txt = (arg_expr_name.text + if arg_expr_name is not None and arg_expr_name.text is not None + else "") + + if arg_expr_name_txt != "": + call_data[call_name_txt]["cumulative_args"] = [ + *call_data[call_name_txt]["cumulative_args"], + arg_expr_name_txt] + + for key in list(call_data): + call_data[key]["cumulative_args"] = list(set(call_data[key]["cumulative_args"])) + + if call_data != {}: + return call_data + return None +def _parse_macro_call(element, language): + macro_calls = {} + if re.search(rf"{{{SRC_NS}}}macro", element.tag): + macro_arg_list = element.find(rf"{{{SRC_NS}}}argument_list") + macro_args = (macro_arg_list.findall(rf"{{{SRC_NS}}}argument") + if macro_arg_list is not None + else None) + + if macro_args is not None: + for arg in macro_args: + arg_text = arg.text if arg is not None and arg.text is not None else "" + + if arg_text != "": + srcml = _get_srcml(arg_text, language) + rootet = ElementTree.fromstring(srcml) + + if re.search(r"{(.)+}", arg_text, flags=re.MULTILINE|re.DOTALL): + for child in rootet.iter(): + call = _parse_function_call(child) + + if call is not None: + macro_calls = {**macro_calls, **call} + + if macro_calls != {}: + return macro_calls + + return None + +def _parse_declaration( + element, + parent_struct_name = '', + 
parent_struct_type = '', + belongs_to_file = ''): + if re.search(rf"{{{SRC_NS}}}decl_stmt|control|struct", element.tag): + decls = [] + + if re.search(rf"{{{SRC_NS}}}control", element.tag): + control_init = element.find(rf"{{{SRC_NS}}}init") + control_init_decls = control_init.findall(rf"{{{SRC_NS}}}decl") + #print (control_init_decls) + decls = [*decls, *control_init_decls] + + if re.search(rf"{{{SRC_NS}}}struct", element.tag): + struct_decls = element.findall(rf"{{{SRC_NS}}}decl") + + decls = [*decls, *struct_decls] + + decls = [*decls, *element.findall(rf"{{{SRC_NS}}}decl")] + + for decl in decls: + decl_type = decl.find(rf"{{{SRC_NS}}}type") if decl is not None else None + + decl_names = decl.findall(rf"{{{SRC_NS}}}name") if decl is not None else None + + type_specifier = (decl_type.find(rf"{{{SRC_NS}}}specifier") + if decl_type is not None + else None) + + type_specifier_txt = (type_specifier.text + if type_specifier is not None and type_specifier.text is not None + else "") + + type_name = decl_type.find(rf"{{{SRC_NS}}}name") if decl_type is not None else None + type_name_txt = (type_name.text + if type_name is not None and type_name.text is not None + else "") + + index_tag = None + index_str = "" + + if type_name_txt == "" and type_name is not None: + i_type_name = type_name.find(rf"{{{SRC_NS}}}name") + + type_name_txt = (i_type_name.text + if i_type_name is not None and i_type_name.text is not None + else "") + + type_name_index = type_name.find(rf"{{{SRC_NS}}}index") + index_tag = type_name_index + + if type_name_index is not None: + for i_str in type_name_index.itertext(): + index_str += i_str + + type_modifier = (decl_type.find(rf"{{{SRC_NS}}}modifier") + if decl_type is not None + else None) + + type_modifier_txt = (type_modifier.text + if type_modifier is not None and type_modifier.text is not None + else "") + + decl_pos = (decl.attrib[rf"{{{POS_NS}}}start"].split(':') + if rf"{{{POS_NS}}}start" in decl.attrib.keys() + else [-1, -1]) + + decl_pos_row 
= int(decl_pos[0]) + + + if type_name != "": + for name in decl_names: + child_name = _get_name_from_nested_name(name) + child_name_txt = (child_name.text + if child_name is not None and child_name.text is not None + else '') + + if child_name_txt != '': + return { + "specifier": type_specifier_txt, + "type": type_name_txt, + "modifier": type_modifier_txt, + "name": child_name_txt, + "index_tag": index_tag, + "index_str": index_str, + "signature": re.sub("/s+", " ", " ".join( + [ + type_specifier_txt, + type_name_txt, + type_modifier_txt, + child_name_txt]).rstrip()), + "pos_row": decl_pos_row, + "file_name": belongs_to_file, + "parent_structure_name": parent_struct_name, + "parent_structure_type": parent_struct_type, + } + + + return None + +def _parse_el_for_global_variable_write( + element, + function_declaration_list, + parameters_passed_by_reference, + pointer_declarations, + calls, + variable_writes, + parent_declarations): + + decl_names = [d["name"] for d in [*function_declaration_list, *pointer_declarations]] + + expr_str = "" + + fan_out_var_candidates = [] + + expr_children = [child for child in element.iter()] + expr_str = ''.join([child for child in element.itertext()]) + + expr_names = element.findall(rf"{{{SRC_NS}}}name") + operators = element.findall(rf"{{{SRC_NS}}}operator") + + incr_decr_op = next( + (op for op in operators + if op is not None and op.text is not None and re.fullmatch(r"^\+\+|\-\-$", op.text)), + None) + + incr_decr_op_txt = (incr_decr_op.text + if incr_decr_op is not None and incr_decr_op.text is not None + else '') + + incr_decr_op_pos = (incr_decr_op.attrib[rf"{{{POS_NS}}}start"].split(':') + if incr_decr_op is not None and rf"{{{POS_NS}}}start" in incr_decr_op.attrib.keys() + else [-1, -1]) + + incr_decr_op_row = int(incr_decr_op_pos[0]) + incr_decr_op_col = int(incr_decr_op_pos[1]) + + equals_ops = [op for op in operators + if op is not None + and op.text is not None + and re.fullmatch(r"^\=|\+\=|\-\=|\*\=|\\\=$", op.text)] + 
+ if len(equals_ops) == 0: + equals_ops = [None] + + last_equals_op_txt = (equals_ops[-1].text + if equals_ops[-1] is not None and equals_ops[-1].text is not None + else '') + + last_equals_op_pos = (equals_ops[-1].attrib[rf"{{{POS_NS}}}start"].split(':') + if equals_ops[-1] is not None and rf"{{{POS_NS}}}start" in equals_ops[-1].attrib.keys() + else [-1, -1]) + + last_equals_op_row = int(last_equals_op_pos[0]) + last_equals_op_col = int(last_equals_op_pos[1]) + + first_equals_op_pos = (equals_ops[0].attrib[rf"{{{POS_NS}}}start"].split(':') + if equals_ops[0] is not None + and rf"{{{POS_NS}}}start" in equals_ops[0].attrib.keys() + else [-1, -1]) + + first_equals_op_col = int(first_equals_op_pos[1]) + + if last_equals_op_txt != '' or incr_decr_op_txt != '': + if len(expr_names) > 0: + first_expr_name = expr_names[0] + first_expr_name_txt = '' + + for name in expr_names: + name_pos = name.attrib[rf"{{{POS_NS}}}start"].split(':') + name_pos_row = int(name_pos[0]) + name_pos_col = int(name_pos[1]) + + expr_sub_names = name.findall(rf"{{{SRC_NS}}}name") + + expr_sub_name = (_get_name_from_nested_name(expr_sub_names[0]) + if len(expr_sub_names) > 1 + else name) + + expr_sub_name_pos =( expr_sub_name.attrib[rf"{{{POS_NS}}}start"].split(':') + if expr_sub_name is not None + and rf"{{{POS_NS}}}start" in expr_sub_name.attrib.keys() + else [-1, -1]) + + expr_sub_name_pos_row = int(expr_sub_name_pos[0]) + expr_sub_name_pos_col = int(expr_sub_name_pos[1]) + + expr_index = name.find(rf"{{{SRC_NS}}}index") + + expr_index_pos = (expr_index.attrib[rf"{{{POS_NS}}}start"].split(':') + if expr_index is not None and rf"{{{POS_NS}}}start" in expr_index.keys() + else [-1, -1]) + + expr_index_pos_row = int(expr_index_pos[0]) + expr_index_pos_col = int(expr_index_pos[1]) + + first_expr_name_txt = (expr_sub_name.text + if expr_sub_name is not None + and expr_sub_name.text is not None + else ''.join([child_txt for child_txt in first_expr_name.itertext()])) + + name_signature = 
''.join([child_txt for child_txt in name.itertext()]) + + name_op = name.findall(rf"{{{SRC_NS}}}operator") + + member_access_op = next( + (op for op in name_op + if op is not None and op.text is not None + and (op.text == '->' or op.text == '.')), None) + + member_access_op_pos = (member_access_op.attrib[rf"{{{POS_NS}}}start"].split(':') + if member_access_op is not None + and rf"{{{POS_NS}}}start" in member_access_op.attrib.keys() + else [-1, -1]) + + member_access_op_pos_row = int(member_access_op_pos[0]) + member_access_op_pos_col = int(member_access_op_pos[1]) + + members_accessed = [] + expr_mod_statements = [] + indices = [] + + index_accessed_str = '' + + if (member_access_op is not None + and member_access_op_pos_row == expr_sub_name_pos_row + and member_access_op_pos_col > expr_sub_name_pos_col + and (member_access_op_pos_col < first_equals_op_col or incr_decr_op_col != -1) + ): + + member_accessed_str = '' + + for child in expr_children: + child_pos = (child.attrib[rf"{{{POS_NS}}}start"].split(':') + if rf"{{{POS_NS}}}start" in child.attrib.keys() + else [-1, -1]) + + child_pos_row = int(child_pos[0]) + child_pos_col = int(child_pos[1]) + + child_txt = ''.join(child.itertext()) if child.text is None else child.text + + if ( + child_pos_row == member_access_op_pos_row and + child_pos_col > member_access_op_pos_col and + (child_pos_col < first_equals_op_col or incr_decr_op_col != -1)): + + if child_txt != '': + if ( + expr_index_pos_col > member_access_op_pos_col and + expr_index_pos_row == member_access_op_pos_row): + index_accessed_str += child_txt + else: + member_accessed_str += child_txt + elif ( + child_pos_col < first_equals_op_col and + expr_index_pos_col < first_equals_op_col and + expr_index_pos_col != -1): + if child_txt != '': + if expr_index_pos_row == member_access_op_pos_row: + index_accessed_str += child_txt + + if index_accessed_str != '': + indices.append(index_accessed_str) + + if member_accessed_str != '': + 
members_accessed.append(member_accessed_str) + + elif member_access_op is None and expr_index is None: + expr_mod_statements.append(expr_str) + + if first_expr_name_txt != "this" and first_expr_name_txt not in decl_names: + fan_out_var_candidates.append({ + "name": first_expr_name_txt, + "signature": name_signature, + "row_pos": name_pos_row, + "col_pos": name_pos_col, + "members_accessed": members_accessed, + "indices" : indices, + "expr_mod_statements": expr_mod_statements + }) + + for cand in fan_out_var_candidates: + if ( + last_equals_op_txt != '' and + last_equals_op_col > cand["col_pos"] and + last_equals_op_row == cand["row_pos"]): + if cand["name"] not in variable_writes.keys(): + variable_writes[cand["name"]] = { + 'expressions': cand["expr_mod_statements"], + 'members_modified': cand["members_accessed"], + 'indices_modified': cand["indices"] + } + else: + variable_writes[cand["name"]]['expressions'] = [ + *variable_writes[cand["name"]]['expressions'], + *cand["expr_mod_statements"]] + + variable_writes[cand["name"]]['members_modified'] = [ + *variable_writes[cand["name"]]['members_modified'], + *cand["members_accessed"]] + + variable_writes[cand["name"]]['indices_modified'] = [ + *variable_writes[cand["name"]]['indices_modified'], + *cand["indices"]] + + elif incr_decr_op_txt and incr_decr_op_row == cand["row_pos"]: + if cand["name"] not in variable_writes.keys(): + variable_writes[cand["name"]] = { + 'expressions': cand["expr_mod_statements"], + 'members_modified': cand["members_accessed"], + 'indices_modified': cand["indices"] + } + else: + variable_writes[cand["name"]]['expressions'] = [ + *variable_writes[cand["name"]]['expressions'], + *cand["expr_mod_statements"]] + + variable_writes[cand["name"]]['members_modified'] = [ + *variable_writes[cand["name"]]['members_modified'], + *cand["members_accessed"]] + + variable_writes[cand["name"]]['indices_modified'] = [ + *variable_writes[cand["name"]]['indices_modified'], + *cand["indices"]] + +def 
_parse_el_for_global_variable_read( + expr, + calls, + function_declarations, + pointer_declarations, + params, + local_function_names, + enums, + read_variable_names, + function_throws_exception_names, + parent_declarations): + declaration_names = [d["name"] for d in function_declarations] + parent_declaration_var_names= [d["name"] for d in parent_declarations if d is not None] + param_names = [p["name"] for p in params] + + call_arg_names = [] + + for key in calls.keys(): + call_arg_names = [*call_arg_names, *calls[key]["cumulative_args"]] + + expr_names = expr.findall(rf"{{{SRC_NS}}}name") + + ops = expr.findall(rf"{{{SRC_NS}}}operator") if expr is not None else None + + last_op = next( + (op for op in list(reversed(ops)) + if op is not None and + op.text is not None and + re.fullmatch(r"^\=|\+\=|\-\=|\*\=|\\\=$", op.text)), None) + + incr_decr_op = next(( + op for op in ops if op is not None and + op.text is not None and + re.fullmatch(r"^\+\+|\-\-$", op.text)), None) + + incr_decr_op_pos = (incr_decr_op.attrib[rf"{{{POS_NS}}}start"].split(':') + if incr_decr_op is not None and + rf"{{{POS_NS}}}start" in incr_decr_op.attrib.keys() + else [-1, -1]) + + incr_decr_op_col = int(incr_decr_op_pos[1]) + + equal_op_pos = ( + last_op.attrib[rf'{{{POS_NS}}}start'].split(':') + if last_op is not None and + rf'{{{POS_NS}}}start' in last_op.attrib.keys() + else [-1, -1]) + + equal_op_pos_col = int(equal_op_pos[1]) + + for arg in call_arg_names: + if( + not isinstance(arg, (int, float, bytes)) and + arg != "" and + arg is not None and + arg not in C_RESERVED_KEYWORDS and + arg not in C_LIB_STREAMS and + not re.match(r"^null$", arg, flags=re.IGNORECASE) and + arg not in declaration_names and + arg not in param_names and + ( + + ( + arg not in list(calls) and + arg not in C_LIB_FUNCTIONS and + arg not in local_function_names and + arg not in enums and + arg not in function_throws_exception_names + ) + or + arg in parent_declaration_var_names + or + arg in param_names + ) + ): 
+ read_variable_names.append(arg) + + for name in expr_names: + name_txt = _get_full_name_text_from_name(name) + + name_pos = ( + name.attrib[rf'{{{POS_NS}}}start'].split(':') + if rf'{{{POS_NS}}}start' in name.attrib.keys() + else [-1, -1]) + + name_pos_col = int(name_pos[1]) + + name_member_access_txt = re.split(r"\-\>|\[|\.", name_txt, 1)[0] + + if( + name_pos_col >= equal_op_pos_col and + equal_op_pos_col <= incr_decr_op_col and + name_member_access_txt != "" and + name_member_access_txt is not None and + name_member_access_txt not in C_RESERVED_KEYWORDS and + name_member_access_txt not in C_LIB_STREAMS and + not re.match(r"^null$", name_member_access_txt, flags=re.IGNORECASE) and + name_member_access_txt not in declaration_names and + ( + ( + name_member_access_txt not in list(calls) and + name_member_access_txt not in C_LIB_FUNCTIONS and + name_member_access_txt not in local_function_names and + name_member_access_txt not in enums and + name_member_access_txt not in function_throws_exception_names + ) + or + name_member_access_txt in parent_declaration_var_names + or + name_member_access_txt in param_names + ) + ): + read_variable_names.append(name_txt) + + read_variable_names = list(set([*read_variable_names])) + +def _compile_acyclical_paths_tree(root): + root_paths = [] + + root_block = root.find(rf"{{{SRC_NS}}}block") + root_block_content = ( + root_block.find(rf"{{{SRC_NS}}}block_content") + if root_block is not None + else root_block if root_block is not None + else root) + + for child in list(root_block_content): + if re.search(rf'{{{SRC_NS}}}if_stmt', child.tag): + root_paths = [ + *root_paths, + *_compile_acyclical_paths_tree(child)] + + elif re.search (rf'{{{SRC_NS}}}if|else', child.tag): + if_type = child.attrib["type"] if "type" in child.attrib.keys() else "" + + root_paths.append({ + "type": child.tag, + "if_type": if_type, + "children": _compile_acyclical_paths_tree(child) + }) + elif re.search(rf'{{{SRC_NS}}}for|while|do', child.tag): + 
root_paths.append({ + "type": child.tag, + "children": _compile_acyclical_paths_tree(child) + }) + elif re.search(rf"{{{SRC_NS}}}switch", child.tag): + root_paths.append({ + "type": child.tag, + "children": _compile_acyclical_paths_tree(child) + }) + elif re.search(rf'{{{SRC_NS}}}case|default', child.tag): + root_paths.append({ + "type": child.tag, + "children": _compile_acyclical_paths_tree(child) + }) + elif re.search(rf"{{{SRC_NS}}}ternary", child.tag): + root_paths.append({ + "type": child.tag, + "children": _compile_acyclical_paths_tree(child) + }) + elif re.search(rf"{{{SRC_NS}}}then", child.tag): + root_paths.append({ + "type": child.tag, + "children": [] + }) + + return root_paths + +def get_function_global_var_ops_and_paths( + function_element, + function_dict, + all_local_call_names, + parent_struct_name, + parent_struct_type, + parent_declarations, + file_name, + enums, + local_function_names, + language): + + if re.search(rf"{{{SRC_NS}}}function|constructor", function_element.tag): + func_sig =_get_signature(function_element) + func_name = _get_name(function_element) + block = function_element.find(rf"{{{SRC_NS}}}block") + + has_return_value = False + + acyc_paths = _compile_acyclical_paths_tree(function_element) + + throws_exception_names = [] + declarations = [] + pointer_decls = [] + + calls = {} + macro_calls = {} + + global_variable_writes = {} + global_variable_reads = [] + + if block is not None: + param_data = _get_param_data(function_element) + param_count = len(param_data["parameters"]) + + for func_child in function_element.iter(): + decl = _parse_declaration( + func_child, + parent_struct_name=parent_struct_name, + parent_struct_type=parent_struct_type, + belongs_to_file=file_name) + + call = _parse_function_call(func_child) + macros = _parse_macro_call(func_child, language) + throws = _get_throws_expression_names(func_child) + + if throws != []: + throws_exception_names = [*throws_exception_names, *throws] + + if decl is not None: + if 
decl["modifier"] == "*": + pointer_decls.append(decl) + else: + declarations.append(decl) + + if call is not None: + calls = {**calls, **call} + all_local_call_names = [*all_local_call_names, *call.keys()] + + if macros is not None: + macro_calls = {**macro_calls, **macros} + + if re.search(rf'{{{SRC_NS}}}return', func_child.tag) and has_return_value is False: + return_expr = func_child.find(rf"{{{SRC_NS}}}expr") + if return_expr is not None: + has_return_value = True + + if re.search(rf"{{{SRC_NS}}}expr|decl_stmt", func_child.tag): + _parse_el_for_global_variable_write( + element = func_child, + function_declaration_list = declarations, + parameters_passed_by_reference = param_data[ + "parameters_passed_by_reference"], + pointer_declarations = pointer_decls, + calls = calls, + variable_writes = global_variable_writes, + parent_declarations = parent_declarations + ) + + _parse_el_for_global_variable_read( + expr = func_child, + calls = calls, + function_declarations = declarations, + pointer_declarations = pointer_decls, + params = param_data["parameters"], + local_function_names = local_function_names, + enums = enums, + read_variable_names = global_variable_reads, + function_throws_exception_names = throws_exception_names, + parent_declarations = parent_declarations + ) + + global_variable_reads = list(set(global_variable_reads)) + + if func_sig not in function_dict.keys(): + local_function_names.append(func_name) + function_dict[func_sig] = { + "signature": func_sig, + "function_name": func_name, + "param_count": param_count, + "calls": calls, + "functions_called_by": [], + "acyclical_paths_tree": acyc_paths, + "has_return": has_return_value, + "parent_structure_name": parent_struct_name, + "parent_structure_type": parent_struct_type, + "file_name": file_name, + "global_variable_writes": global_variable_writes, + "global_variable_reads": list(set(global_variable_reads)) + } + + return function_dict + +def get_functions_with_metric_properties( + root_element, + 
parent_struct_name, + parent_struct_type, + parent_declarations, + file_name, + local_function_names, + enums, + all_local_call_names, + language + ): + parent_name_txt = parent_struct_name + local_declarations = parent_declarations + + function_dict = {} + for child in list(root_element): + if re.search(rf"{{{SRC_NS}}}class|struct|namespace|unit", child.tag): + parent_name = child.find(rf"{{{SRC_NS}}}name") + new_parent_struct_type = re.sub(r"{.+}", "", child.tag) + new_parent_name_txt = parent_struct_name + _get_full_name_text_from_name(parent_name) + + class_declarations = [ + _parse_declaration( + element = decl, + parent_struct_name = new_parent_name_txt, + parent_struct_type = new_parent_struct_type, + belongs_to_file = file_name) + for decl in child.findall(rf"{{{SRC_NS}}}decl_stmt")] + + class_enums = [ + _parse_enum(el) + for el in child.findall(rf'{{{SRC_NS}}}enum') + ] + + local_declarations = [*parent_declarations, *class_declarations] + enums = [*enums, *class_enums] + + function_dict = {**function_dict, + **get_functions_with_metric_properties( + root_element = child, + all_local_call_names = all_local_call_names, + parent_struct_name = new_parent_name_txt, + parent_struct_type = new_parent_struct_type, + parent_declarations = local_declarations, + file_name = file_name, + local_function_names=[f["function_name"] for f in function_dict.values()], + enums = enums, + language = language)} + + if re.search(rf"{{{SRC_NS}}}block|block_content", child.tag): + function_dict = {**function_dict, + **get_functions_with_metric_properties( + root_element = child, + all_local_call_names = all_local_call_names, + parent_struct_name = parent_name_txt, + parent_struct_type = parent_struct_type, + parent_declarations = local_declarations, + file_name = file_name, + local_function_names=[f["function_name"] for f in function_dict.values()], + enums = enums, + language = language)} + + if re.search(rf"{{{SRC_NS}}}function|constructor", child.tag): + updated_function_dict 
= get_function_global_var_ops_and_paths( + function_element = child, + function_dict = function_dict, + all_local_call_names = all_local_call_names, + parent_struct_name = parent_name_txt, + parent_struct_type = parent_struct_type, + parent_declarations = parent_declarations, + file_name = file_name, + local_function_names=[f["function_name"] for f in function_dict.values()], + enums = enums, + language = language) + + function_dict = {**function_dict, **updated_function_dict} + + return function_dict class SrcMLParser: def __init__(self, language): @@ -136,3 +998,40 @@ def get_functions(self, name, contents): functions.extend(_get_definitions(srcml, nlines)) return functions + + def get_functions_with_properties(self, file_name, contents): + srcml = _get_srcml(contents, self._language) + + if srcml is None: + logger.debug('Srcml parser is none') + return "None" + else: + logger.debug('Successfully retrieved srcml parser') + + root = ElementTree.fromstring(srcml) + + root_declarations = [ + _parse_declaration( + element = decl, + parent_struct_name=file_name, + parent_struct_type='file', + belongs_to_file='file_name') + for decl in root.findall(rf'{{{SRC_NS}}}decl_stmt')] + + root_enums = [ + _parse_enum(el) + for el in root.findall(rf'{{{SRC_NS}}}enum') + ] + + func_dict = get_functions_with_metric_properties( + root_element = root, + parent_struct_name = file_name, + parent_struct_type= 'file', + parent_declarations=root_declarations, + all_local_call_names=[], + local_function_names=[], + language=self._language, + enums = root_enums, + file_name = file_name) + + return func_dict diff --git a/parser/parser/schemas.py b/parser/parser/schemas.py index 8ec2edc..15375a2 100644 --- a/parser/parser/schemas.py +++ b/parser/parser/schemas.py @@ -1,4 +1,4 @@ -from marshmallow import Schema, fields, post_load +from marshmallow import Schema, fields, post_load, EXCLUDE from .models import Comment, Function, Position, Span @@ -29,11 +29,63 @@ class CommentSchema(Schema): 
def make_comment(self, data, **kwargs): return Comment(**data) - class FunctionSchema(Schema): signature = fields.String() span = fields.Nested(SpanSchema) + class Meta: + unknown = EXCLUDE + + param_count = fields.Integer( + default = 0, + allow_none = True + ) + + calls = fields.List( + fields.String(), + default = [], + allow_none=True + ) + + functions_called_by = fields.List( + fields.String(), + default = [], + allow_none=True + ) + + acyclical_paths_tree = fields.List( + fields.Dict(), + allow_none=True + ) + + has_return = fields.Boolean( + default = False, + allow_none=True + ) + + parent_structure_name = fields.String(allow_none=True) + parent_structure_type = fields.String(allow_none=True) + file_name = fields.String(allow_none=True) + + global_variable_writes = fields.Dict( + keys = fields.String(), + values = fields.Dict( + keys = fields.String(), + values = fields.List( + fields.String, + default = [], + allow_none = True + ) + ), + allow_none = True + ) + + global_variable_reads = fields.List( + fields.String(), + allow_none=True + ) + + @post_load def make_function(self, data, **kwargs): return Function(**data) diff --git a/parser/parser/service.py b/parser/parser/service.py index 704f3de..5124016 100644 --- a/parser/parser/service.py +++ b/parser/parser/service.py @@ -2,6 +2,7 @@ from nameko.dependency_providers import Config from nameko.rpc import rpc +from nameko.testing.services import worker_factory from . 
import utilities from .languages import get_languages @@ -56,3 +57,31 @@ def get_functions(self, name, contents): functions = FunctionSchema(many=True).dump(functions) return functions + + @rpc + def get_language(self, name): + return self.inferer.infer(name) + + @rpc + def get_functions_with_properties(self, name, contents): + functions = None + function_list = [] + + language = self.inferer.infer(name) + if language is None: + logger.debug('%s is an unsupported language', name) + else: + parser = get_parser(language) + functions = parser.get_functions_with_properties(name, contents) + + if functions is not None and functions != {}: + for key in functions.keys(): + function_list.append( + FunctionSchema(many=False) + .dump(functions[key]) + ) + + functions = function_list + + + return functions From e122fc6b0ca8825f0fa00b062d8f80132bb19dd0 Mon Sep 17 00:00:00 2001 From: Brandon Date: Fri, 4 Jun 2021 14:07:54 -0400 Subject: [PATCH 2/4] Removed an unimportant comment --- parser/parser/parsers/srcmlparser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parser/parser/parsers/srcmlparser.py b/parser/parser/parsers/srcmlparser.py index b153b83..a1dd188 100644 --- a/parser/parser/parsers/srcmlparser.py +++ b/parser/parser/parsers/srcmlparser.py @@ -291,7 +291,6 @@ def _parse_declaration( if re.search(rf"{{{SRC_NS}}}control", element.tag): control_init = element.find(rf"{{{SRC_NS}}}init") control_init_decls = control_init.findall(rf"{{{SRC_NS}}}decl") - #print (control_init_decls) decls = [*decls, *control_init_decls] if re.search(rf"{{{SRC_NS}}}struct", element.tag): From 615dbcf4e706373c84eab85c22fc46665705ac27 Mon Sep 17 00:00:00 2001 From: Brandon Date: Thu, 17 Jun 2021 16:40:35 -0400 Subject: [PATCH 3/4] Implement pull request changes from reviewer This commit implements the changes suggested by the reviewer, Nuthan Munaiah, in the pull request for this branch to merge into the development branch. 
This mainly consists of minor improvements in code efficiency, sources of constant values, and a stronger enforcement of pylint rules. The exact improvements can be seen in the conversations had on the parser branch pull request. --- .../functionchurn/schemas/parser.py | 5 +- metrics/loc/loc/schemas/parser.py | 5 +- parser/parser/constants.py | 224 ++- parser/parser/parsers/__init__.py | 2 +- parser/parser/parsers/srcmlparser.py | 1413 +++++++++-------- parser/parser/schemas.py | 5 +- parser/parser/service.py | 2 - 7 files changed, 900 insertions(+), 756 deletions(-) diff --git a/metrics/functionchurn/functionchurn/schemas/parser.py b/metrics/functionchurn/functionchurn/schemas/parser.py index 547a701..cea3935 100644 --- a/metrics/functionchurn/functionchurn/schemas/parser.py +++ b/metrics/functionchurn/functionchurn/schemas/parser.py @@ -1,4 +1,4 @@ -from marshmallow import Schema, fields, post_load, EXCLUDE +from marshmallow import Schema, fields, post_load from ..models import Comment, Function, Position, Span @@ -34,9 +34,6 @@ class FunctionSchema(Schema): signature = fields.String() span = fields.Nested(SpanSchema) - class Meta: - unknown = EXCLUDE - @post_load def make_function(self, data, **kwargs): return Function(**data) diff --git a/metrics/loc/loc/schemas/parser.py b/metrics/loc/loc/schemas/parser.py index 547a701..cea3935 100644 --- a/metrics/loc/loc/schemas/parser.py +++ b/metrics/loc/loc/schemas/parser.py @@ -1,4 +1,4 @@ -from marshmallow import Schema, fields, post_load, EXCLUDE +from marshmallow import Schema, fields, post_load from ..models import Comment, Function, Position, Span @@ -34,9 +34,6 @@ class FunctionSchema(Schema): signature = fields.String() span = fields.Nested(SpanSchema) - class Meta: - unknown = EXCLUDE - @post_load def make_function(self, data, **kwargs): return Function(**data) diff --git a/parser/parser/constants.py b/parser/parser/constants.py index 042b3cf..4da7240 100644 --- a/parser/parser/constants.py +++ 
b/parser/parser/constants.py @@ -1,69 +1,157 @@ -C_LIB_FUNCTIONS = [ - "abort","abs","acos","asctime","asctime_r", - "asin","assert","atan","atan2","atexit", - "atof","atoi","atol","bsearch","btowc", - "calloc","catclose","catgets","catopen","ceil", - "clearerr","clock","cos","cosh","ctime", - "ctime64","ctime_r","ctime64_r","difftime","difftime64", - "div","erf","erfc","exit","exp", - "fabs","fclose","fdopen","feof","ferror", - "fflush","fgetc","fgetpos","fgets","fgetwc", - "fgetws","fileno","floor","fmod","fopen", - "fprintf","fputc","fputs","fputwc","fputws", - "fread","free","freopen","frexp","fscanf", - "fseek","fsetpos","ftell","fwide","fwprintf", - "fwrite","fwscanf","gamma","getc","getchar", - "getenv","gets","getwc","getwchar","gmtime", - "gmtime64","gmtime_r","gmtime64_r","hypot","isalnum", - "isalpha","isascii","isblank","iscntrl","isdigit", - "isgraph","islower","isprint","ispunct","isspace", - "isupper","iswalnum","iswalpha","iswblank","iswcntrl", - "iswctype","iswdigit","iswgraph","iswlower","iswprint", - "iswpunct","iswspace","iswupper","iswxdigit","isxdigit", - "j0","j1","jn","labs","ldexp", - "ldiv","localeconv","localtime","localtime64","localtime_r", - "localtime64_r","log","log10","longjmp","malloc", - "mblen","mbrlen","mbrtowc","mbsinit","mbsrtowcs", - "mbstowcs","mbtowc","memchr","memcmp","memcpy", - "memmove","memset","mktime","mktime64","modf", - "nextafter","nextafterl","nexttoward","nexttowardl","nl_langinfo", - "perror","pow","printf","putc","putchar", - "putenv","puts","putwc","putwchar","qsort", - "quantexpd32","quantexpd64","quantexpd128","quantized32","quantized64", - "quantized128","samequantumd32","samequantumd64","samequantumd128","raise", - "rand","rand_r","realloc","regcomp","regerror","regexec", - "regfree","remove","rename","rewind","scanf", - "setbuf","setjmp","setlocale","setvbuf","signal", - "sin","sinh","snprintf","sprintf","sqrt", - "srand","sscanf","strcasecmp","strcat","strchr", - 
"strcmp","strcoll","strcpy","strcspn","strerror", - "strfmon","strftime","strlen","strncasecmp","strncat", - "strncmp","strncpy","strpbrk","strptime","strrchr", - "strspn","strstr","strtod","strtod32","strtod64", - "strtod128","strtof","strtok","strtok_r","strtol", - "strtold","strtoul","strxfrm","swprintf","swscanf", - "system","tan","tanh","time","time64", - "tmpfile","tmpnam","toascii","tolower","toupper", - "towctrans","towlower","towupper","ungetc","ungetwc", - "va_arg","va_copy","va_end","va_start", - "vfprintf","vfscanf","vfwprintf","vfwscanf", - "vprintf","vscanf","vsprintf","vsnprintf", - "vsscanf","vswprintf","vswscanf","vwprintf","vwscanf", - "wcrtomb","wcscat","wcschr","wcscmp","wcscoll", - "wcscpy","wcscspn","wcsftime","wcslen","wcslocaleconv", - "wcsncat","wcsncmp","wcsncpy","wcspbrk","wcsptime", - "wcsrchr","char *ctime64_r(const time64_t *time, char *buf);","wcsspn","wcsstr","wcstod", - "wcstod32","wcstod64","wcstod128","wcstof","wcstok", - "wcstol","wcstold","wcstombs","wcstoul","wcsxfrm", - "wctob","wctomb","wctrans","wctype","wcwidth", - "wmemchr","wmemcmp","wmemcpy","wmemmove","wmemset", - "wprintf","wscanf","y0","y1","yn"] +""" +From: +https://www.ibm.com/docs/en/i/7.3?topic=extensions-standard-c-library-functions-table-by-name +https://www.cplusplus.com/reference/iolibrary/#:~:text=A%20stream%20is%20an%20abstraction,and%20ouput%20operations%20are%20performed.&text=For%20example%2C%20file%20streams%20are,physically%20reflected%20in%20the%20file. 
+https://www.cprogramming.com/function.html -C_LIB_STREAMS = ["stderr", "stdout"] -C_RESERVED_KEYWORDS = [ - "auto", "const", "int", "short", "struct", - "unsigned", "double", "float", "break", "continue", - "long", "signed", "switch", "void", "else", "for", "case", - "default", "register", "sizeof", "typedef", "volatile", - "enum", "goto", "char", "do", "return", "static", - "union", "while", "extern", "if"] \ No newline at end of file +It's necessary to keep track of the names of built in functions/streams/reservered keywords so that the +algorithms in the srcmlparser FunctionCollector class doesn't confuse the names of certain srcml elements +with variable names in expression tags that are being parsed and collected. If they do get confused, it could +reduce the accuracy of the flow metric. +""" + +C_LIB_FUNCTIONS = ('abort', 'abs', 'acos', 'asctime', +'asctime_r', 'asin', 'assert', 'atan', +'atan2', 'atexit', 'atof', 'atoi', +'atol', 'bsearch', 'btowc', 'calloc', +'catclose', 'catgets', 'catopen', 'ceil', +'clearerr', 'clock', 'cos', 'cosh', +'ctime', 'ctime64', 'ctime_r', 'ctime64_r', +'difftime', 'difftime64', 'div', 'erf', +'erfc', 'exit', 'exp', 'fabs', +'fclose', 'fdopen', 'feof', 'ferror', +'fflush', 'fgetc', 'fgetpos', 'fgets', +'fgetwc', 'fgetws', 'fileno', 'floor', +'fmod', 'fopen', 'fprintf', 'fputc', +'fputs', 'fputwc', 'fputws', 'fread', +'free', 'freopen', 'frexp', 'fscanf', +'fseek', 'fsetpos', 'ftell', 'fwide', +'fwprintf', 'fwrite', 'fwscanf', 'gamma', +'getc', 'getchar', 'getenv', 'gets', +'getwc', 'getwchar', 'gmtime', 'gmtime64', +'gmtime_r', 'gmtime64_r', 'hypot', 'isalnum', +'isalpha', 'isascii', 'isblank', 'iscntrl', +'isdigit', 'isgraph', 'islower', 'isprint', +'ispunct', 'isspace', 'isupper', 'iswalnum', +'iswalpha', 'iswblank', 'iswcntrl', 'iswctype', +'iswdigit', 'iswgraph', 'iswlower', 'iswprint', +'iswpunct', 'iswspace', 'iswupper', 'iswxdigit', +'isxdigit', 'j0', 'j1', 'jn', +'labs', 'ldexp', 'ldiv', 'localeconv', +'localtime', 
'localtime64', 'localtime_r', 'localtime64_r', +'log', 'log10', 'longjmp', 'malloc', +'mblen', 'mbrlen', 'mbrtowc', 'mbsinit', +'mbsrtowcs', 'mbstowcs', 'mbtowc', 'memchr', +'memcmp', 'memcpy', 'memmove', 'memset', +'mktime', 'mktime64', 'modf', 'nextafter', +'nextafterl', 'nexttoward', 'nexttowardl', 'nl_langinfo', +'perror', 'pow', 'printf', 'putc', +'putchar', 'putenv', 'puts', 'putwc', +'putwchar', 'qsort', 'quantexpd32', 'quantexpd64', +'quantexpd128', 'quantized32', 'quantized64', 'quantized128', +'samequantumd32', 'samequantumd64', 'samequantumd128', 'raise', +'rand', 'rand_r', 'realloc', 'regcomp', +'regerror', 'regexec', 'regfree', 'remove', +'rename', 'rewind', 'scanf', 'setbuf', +'setjmp', 'setlocale', 'setvbuf', 'signal', +'sin', 'sinh', 'snprintf', 'sprintf', +'sqrt', 'srand', 'sscanf', 'strcasecmp', +'strcat', 'strchr', 'strcmp', 'strcoll', +'strcpy', 'strcspn', 'strerror', 'strfmon', +'strftime', 'strlen', 'strncasecmp', 'strncat', +'strncmp', 'strncpy', 'strpbrk', 'strptime', +'strrchr', 'strspn', 'strstr', 'strtod', +'strtod32', 'strtod64', 'strtod128', 'strtof', +'strtok', 'strtok_r', 'strtol', 'strtold', +'strtoul', 'strxfrm', 'swprintf', 'swscanf', +'system', 'tan', 'tanh', 'time', +'time64', 'tmpfile', 'tmpnam', 'toascii', +'tolower', 'toupper', 'towctrans', 'towlower', +'towupper', 'ungetc', 'ungetwc', 'va_arg', +'va_copy', 'va_end', 'va_start', 'vfprintf', +'vfscanf','vfwprintf', 'vfwscanf', 'vprintf', +'vscanf', 'vsprintf', 'vsnprintf', 'vsscanf', +'vswprintf', 'vswscanf', 'vwprintf', 'vwscanf', +'wcrtomb', 'wcscat', 'wcschr', 'wcscmp', +'wcscoll', 'wcscpy', 'wcscspn', 'wcsftime', +'wcslen', 'wcslocaleconv', 'wcsncat', 'wcsncmp', +'wcsncpy', 'wcspbrk', 'wcsptime', 'wcsrchr', +'char *ctime64_r(const time64_t *time, char *buf);', +'wcsspn', 'wcsstr', 'wcstod', 'wcstod32', +'wcstod64', 'wcstod128', 'wcstof', 'wcstok', 'wcstol', +'wcstold', 'wcstombs', 'wcstoul', 'wcsxfrm', +'wctob', 'wctomb', 'wctrans', 'wctype', +'wcwidth', 'wmemchr', 
'wmemcmp', 'wmemcpy', +'wmemmove', 'wmemset', 'wprintf','wscanf', +'y0', 'y1', 'yn') + +C_LIB_STREAMS = ("stderr", "stdout") + +C_RESERVED_KEYWORDS = ( + "auto", "const", "int", "short", "struct", + "unsigned", "double", "float", "break", "continue", + "long", "signed", "switch", "void", "else", "for", "case", + "default", "register", "sizeof", "typedef", "volatile", + "enum", "goto", "char", "do", "return", "static", + "union", "while", "extern", "if") + +C_PLUS_PLUS_STDLIB_FUNCS = ( +'abort', 'abs' ,'acos', 'asin', +'atan', 'atexit', 'atof', 'atoi', +'atol', 'ceil', 'clock', 'cosh', +'ctime', 'div', 'exit', 'fabs', +'floor', 'fmod', 'getchar', 'getenv', +'isalnum', 'isalpha', 'isdigit', 'isgraph', +'ispunct', 'isspace', 'isupper', 'kbhit', +'log10', 'log2', 'log', 'memcmp', +'modf', 'pow', 'putchar', 'putenv', +'puts', 'rand', 'remove', 'rename', +'sinh', 'sqrt', 'srand', 'strcat', +'strcmp', 'strerror', 'time', 'tolower', +'toupper', +) + +C_PLUS_PLUS_RESERVED_KEYWORDS = ( + 'alignas', 'alignof', 'and', 'and_eq', + 'asm', 'atomic_cancel', 'atomic_commit', 'atomic_noexcept', + 'auto', 'bitand', 'bitor', 'bool', + 'break', 'case', 'catch', 'char', + 'char8_t', 'char16_t', 'char32_t', 'class', + 'compl', 'concept', 'const', 'consteval', + 'constexpr', 'constinit', 'const_cast', 'continue', + 'co_await', 'co_return', 'co_yield', 'decltype', + 'default', 'delete', 'do', 'double', + 'dynamic_cast', 'else', 'enum', 'explicit', + 'export', 'extern', 'false', 'float', + 'for', 'friend', 'goto', 'if', + 'inline', 'int', 'long', 'mutable', + 'namespace', 'new', 'noexcept', 'not', + 'not_eq', 'nullptr', 'operator', 'or', + 'or_eq', 'private', 'protected', 'public', + 'reflexpr', 'register', 'reinterpret_cast', 'requires', + 'return', 'short', 'signed', 'sizeof', + 'static', 'static_assert', 'static_cast', 'struct', + 'switch', 'synchronized', 'template', 'this', + 'thread_local', 'throw', 'true', 'try', + 'typedef', 'typeid', 'typename', 'union', + 'unsigned', 'using (1)', 
'virtual', 'void', + 'volatile', 'wchar_t', 'while', 'xor', + 'xor_eq') + +C_PLUS_PLUS_STREAMS = ( +'cin', 'cout', 'cerr', 'clog' +) + +def _get_constants_from_language(language): + if language == 'C': + return ( + C_LIB_FUNCTIONS, + C_LIB_STREAMS, + C_RESERVED_KEYWORDS) + elif language == 'C++': + return ( + C_PLUS_PLUS_STDLIB_FUNCS, + C_PLUS_PLUS_STREAMS, + C_PLUS_PLUS_RESERVED_KEYWORDS) + + return (), (), () diff --git a/parser/parser/parsers/__init__.py b/parser/parser/parsers/__init__.py index 2933f5e..b83365e 100644 --- a/parser/parser/parsers/__init__.py +++ b/parser/parser/parsers/__init__.py @@ -1,7 +1,7 @@ from .srcmlparser import SrcMLParser from ..exceptions import NoParser -PARSERS = {'C': SrcMLParser, 'C++': SrcMLParser, 'Java': SrcMLParser} +PARSERS = {'C': SrcMLParser, 'C++': SrcMLParser} def get_parser(language): diff --git a/parser/parser/parsers/srcmlparser.py b/parser/parser/parsers/srcmlparser.py index a1dd188..327241f 100644 --- a/parser/parser/parsers/srcmlparser.py +++ b/parser/parser/parsers/srcmlparser.py @@ -6,7 +6,7 @@ from ..enumerations import CommentType from ..models import Comment, Function, Position, Span -from ..constants import C_LIB_FUNCTIONS, C_LIB_STREAMS, C_RESERVED_KEYWORDS +from ..constants import _get_constants_from_language logger = logging.getLogger(__name__) @@ -41,16 +41,15 @@ def _get_declarations(srcml): def _parse_enum(element): enum_names = [] - if re.search(rf"{{{SRC_NS}}}enum", element.tag): + if element.tag == f'{{{SRC_NS}}}enum': - enum_block = element.find(rf"{{{SRC_NS}}}block") - enum_decls = enum_block.findall(rf"{{{SRC_NS}}}decl") if enum_block is not None else [] + enum_block = element.find(f'{{{SRC_NS}}}block') + enum_decls = (enum_block.findall(f'{{{SRC_NS}}}decl') + if enum_block is not None else []) for decl in enum_decls: - decl_name = _get_name_from_nested_name( - decl.find(rf"{{{SRC_NS}}}name") - ) + decl.find(f'{{{SRC_NS}}}name')) decl_name_txt = ( decl_name.text @@ -84,11 +83,14 @@ def 
_get_name(element): return name def _get_span(element): - position = element.attrib[f'{{{POS_NS}}}start'] - begin = (int(i) for i in position.split(':')) - position = element.attrib[f'{{{POS_NS}}}end'] - end = (int(i) for i in position.split(':')) - return begin, end + if element is not None: + position = element.attrib[f'{{{POS_NS}}}start'] + begin = (int(i) for i in position.split(':')) + position = element.attrib[f'{{{POS_NS}}}end'] + end = (int(i) for i in position.split(':')) + + return begin, end + return (-1, -1), (-1, -1) def _get_signature(element): def _join(values, delimiter=' '): @@ -122,19 +124,15 @@ def _get_srcml(contents, language): ) return process.stdout except subprocess.CalledProcessError as error: - print(error) + logger.exception(error) return None def _get_name_from_nested_name(name): - if name is not None and name.text is not None: - return name - else: - next_elname = name.find(rf"{{{SRC_NS}}}name") - if next_elname is None: - return next_elname - - return _get_name_from_nested_name(next_elname) + curr, prev = name, None + while curr is not None: + prev, curr = curr, curr.find(f'{{{SRC_NS}}}name') + return prev def _get_full_name_text_from_name(name): name_txt = '' @@ -143,19 +141,19 @@ def _get_full_name_text_from_name(name): if name.text is not None: name_txt = name.text else: - for n_txt in name.itertext(): - name_txt += n_txt + name_txt = ''.join(n_txt for n_txt in name.itertext()) return name_txt -def _get_throws_expression_names(statement): +def _get_throws_expression_names(element): exception_names = [] - if re.search(rf"{{{SRC_NS}}}throws", statement.tag): - args = statement.findall(rf"{{{SRC_NS}}}argument") + if element.tag == f'{{{SRC_NS}}}throws': + args = element.findall(f'{{{SRC_NS}}}argument') for arg in args: - expr = arg.find(rf"{{{SRC_NS}}}expr") - expr_name = expr.find(rf"{{{SRC_NS}}}name") if expr is not None else None + expr = arg.find(f'{{{SRC_NS}}}expr') + expr_name = (expr.find(f'{{{SRC_NS}}}name') + if expr is not 
None else None) name_txt = _get_full_name_text_from_name(expr_name) @@ -165,87 +163,95 @@ def _get_throws_expression_names(statement): return exception_names def _get_param_data(function): - parameter_list = function.find(rf"{{{SRC_NS}}}parameter_list") - parameters = parameter_list.findall(rf"{{{SRC_NS}}}parameter") + parameter_list = function.find(f'{{{SRC_NS}}}parameter_list') + parameters = parameter_list.findall(f'{{{SRC_NS}}}parameter') parameter_declarations = [] parameters_passed_by_reference = [] for param in parameters: - decl = param.find(rf"{{{SRC_NS}}}decl") - decl_name = decl.find(rf"{{{SRC_NS}}}name") if decl is not None else None - decl_name_txt = decl_name.text if decl_name is not None and decl_name.text else "" + decl = param.find(f'{{{SRC_NS}}}decl') + + decl_name = (decl.find(f'{{{SRC_NS}}}name') + if decl is not None else None) + + decl_name_txt = (decl_name.text + if decl_name is not None and decl_name.text else '') + + decl_type = (decl.find(f'{{{SRC_NS}}}type') + if decl is not None else None) + + decl_type_name = (decl_type.find(f'{{{SRC_NS}}}name') + if decl_type is not None else None) - decl_type = decl.find(rf"{{{SRC_NS}}}type") if decl is not None else None - decl_type_name = decl_type.find(rf"{{{SRC_NS}}}name") if decl_type is not None else None decl_type_name_txt = ( decl_type_name.text if decl_type_name is not None and decl_type_name.text - else "") + else '') - decl_type_modifier = (decl_type.find(rf"{{{SRC_NS}}}modifier") + decl_type_modifier = (decl_type.find(f'{{{SRC_NS}}}modifier') if decl_type is not None else None) decl_type_modifier_txt = (decl_type_modifier.text if decl_type_modifier is not None and decl_type_modifier.text - else "") + else '') - if re.search(r"\*|\&", decl_type_modifier_txt): + if decl_type_modifier_txt in {'*', '&'}: parameters_passed_by_reference.append( { - "type": decl_type_name_txt, - "modifier": decl_type_modifier_txt, - "name": decl_name_txt}) + 'type': decl_type_name_txt, + 'modifier': 
decl_type_modifier_txt, + 'name': decl_name_txt}) - if decl_name_txt != "": + if decl_name_txt != '': parameter_declarations.append( { - "type": decl_type_name_txt, - "modifier": decl_type_modifier_txt, - "name": decl_name_txt}) + 'type': decl_type_name_txt, + 'modifier': decl_type_modifier_txt, + 'name': decl_name_txt}) return { - "parameters" : parameter_declarations, - "parameters_passed_by_reference": parameters_passed_by_reference} + 'parameters' : parameter_declarations, + 'parameters_passed_by_reference': parameters_passed_by_reference} def _parse_function_call(element): call_data = {} - if re.search(rf"{{{SRC_NS}}}call", str(element)): - call_name = element.find(rf"{{{SRC_NS}}}name") + if element.tag == f'{{{SRC_NS}}}call': + call_name = element.find(f'{{{SRC_NS}}}name') call_name_txt = _get_full_name_text_from_name(call_name) - - if call_name_txt not in list(call_data) and call_name_txt != "": + if call_name_txt not in list(call_data) and call_name_txt != '': call_data[call_name_txt] ={ - "cumulative_args": [] + 'cumulative_args': [] } - if call_name_txt != "": - call_arg_list = element.find(rf"{{{SRC_NS}}}argument_list") + if call_name_txt != '': + call_arg_list = element.find(f'{{{SRC_NS}}}argument_list') - call_args = (call_arg_list.findall(rf"{{{SRC_NS}}}argument") + call_args = (call_arg_list.findall(f'{{{SRC_NS}}}argument') if call_arg_list is not None else []) for arg in call_args: - arg_expr = arg.find(rf"{{{SRC_NS}}}expr") - arg_expr_name = (arg_expr.find(rf"{{{SRC_NS}}}name") + arg_expr = arg.find(f'{{{SRC_NS}}}expr') + arg_expr_name = (arg_expr.find(f'{{{SRC_NS}}}name') if arg_expr is not None else None) arg_expr_name_txt = (arg_expr_name.text if arg_expr_name is not None and arg_expr_name.text is not None - else "") + else '') - if arg_expr_name_txt != "": - call_data[call_name_txt]["cumulative_args"] = [ - *call_data[call_name_txt]["cumulative_args"], + if arg_expr_name_txt != '': + call_data[call_name_txt]['cumulative_args'] = [ + 
*call_data[call_name_txt]['cumulative_args'], arg_expr_name_txt] for key in list(call_data): - call_data[key]["cumulative_args"] = list(set(call_data[key]["cumulative_args"])) + call_data[key]['cumulative_args'] = list( + set(call_data[key]['cumulative_args'])) if call_data != {}: return call_data @@ -254,21 +260,24 @@ def _parse_function_call(element): def _parse_macro_call(element, language): macro_calls = {} - if re.search(rf"{{{SRC_NS}}}macro", element.tag): - macro_arg_list = element.find(rf"{{{SRC_NS}}}argument_list") - macro_args = (macro_arg_list.findall(rf"{{{SRC_NS}}}argument") + if element.tag == f'{{{SRC_NS}}}macro': + macro_arg_list = element.find(f'{{{SRC_NS}}}argument_list') + macro_args = (macro_arg_list.findall(f'{{{SRC_NS}}}argument') if macro_arg_list is not None else None) if macro_args is not None: for arg in macro_args: - arg_text = arg.text if arg is not None and arg.text is not None else "" + arg_text = (arg.text + if arg is not None and arg.text is not None else '') - if arg_text != "": + if arg_text != '': srcml = _get_srcml(arg_text, language) rootet = ElementTree.fromstring(srcml) - if re.search(r"{(.)+}", arg_text, flags=re.MULTILINE|re.DOTALL): + if re.search(r'{(.)+}', + arg_text, + flags=re.MULTILINE|re.DOTALL): for child in rootet.iter(): call = _parse_function_call(child) @@ -285,72 +294,78 @@ def _parse_declaration( parent_struct_name = '', parent_struct_type = '', belongs_to_file = ''): - if re.search(rf"{{{SRC_NS}}}decl_stmt|control|struct", element.tag): + if element.tag in { + f'{{{SRC_NS}}}decl_stmt', + f'{{{SRC_NS}}}control', + f'{{{SRC_NS}}}struct' + }: decls = [] - if re.search(rf"{{{SRC_NS}}}control", element.tag): - control_init = element.find(rf"{{{SRC_NS}}}init") - control_init_decls = control_init.findall(rf"{{{SRC_NS}}}decl") + if element.tag == f'{{{SRC_NS}}}control': + control_init = element.find(f'{{{SRC_NS}}}init') + control_init_decls = control_init.findall(f'{{{SRC_NS}}}decl') decls = [*decls, 
*control_init_decls] - if re.search(rf"{{{SRC_NS}}}struct", element.tag): - struct_decls = element.findall(rf"{{{SRC_NS}}}decl") + if element.tag == f'{{{SRC_NS}}}struct': + struct_decls = element.findall(f'{{{SRC_NS}}}decl') decls = [*decls, *struct_decls] - decls = [*decls, *element.findall(rf"{{{SRC_NS}}}decl")] + decls = [*decls, *element.findall(f'{{{SRC_NS}}}decl')] for decl in decls: - decl_type = decl.find(rf"{{{SRC_NS}}}type") if decl is not None else None + decl_type = (decl.find(f'{{{SRC_NS}}}type') + if decl is not None else None) - decl_names = decl.findall(rf"{{{SRC_NS}}}name") if decl is not None else None + decl_names = (decl.findall(f'{{{SRC_NS}}}name') + if decl is not None else None) - type_specifier = (decl_type.find(rf"{{{SRC_NS}}}specifier") + type_specifier = (decl_type.find(f'{{{SRC_NS}}}specifier') if decl_type is not None else None) type_specifier_txt = (type_specifier.text - if type_specifier is not None and type_specifier.text is not None - else "") + if type_specifier is not None and + type_specifier.text is not None + else '') + + type_name = (decl_type.find(f'{{{SRC_NS}}}name') + if decl_type is not None else None) - type_name = decl_type.find(rf"{{{SRC_NS}}}name") if decl_type is not None else None type_name_txt = (type_name.text if type_name is not None and type_name.text is not None - else "") + else '') index_tag = None - index_str = "" + index_str = '' - if type_name_txt == "" and type_name is not None: - i_type_name = type_name.find(rf"{{{SRC_NS}}}name") + if type_name_txt == '' and type_name is not None: + i_type_name = type_name.find(f'{{{SRC_NS}}}name') type_name_txt = (i_type_name.text - if i_type_name is not None and i_type_name.text is not None - else "") + if i_type_name is not None and + i_type_name.text is not None + else '') - type_name_index = type_name.find(rf"{{{SRC_NS}}}index") + type_name_index = type_name.find(f'{{{SRC_NS}}}index') index_tag = type_name_index if type_name_index is not None: - for i_str in 
type_name_index.itertext(): - index_str += i_str + index_str = ''.join( + i_str for i_str in type_name_index.itertext()) - type_modifier = (decl_type.find(rf"{{{SRC_NS}}}modifier") + type_modifier = (decl_type.find(f'{{{SRC_NS}}}modifier') if decl_type is not None else None) type_modifier_txt = (type_modifier.text if type_modifier is not None and type_modifier.text is not None - else "") - - decl_pos = (decl.attrib[rf"{{{POS_NS}}}start"].split(':') - if rf"{{{POS_NS}}}start" in decl.attrib.keys() - else [-1, -1]) + else '') + decl_pos = tuple(_get_span(decl)[0]) decl_pos_row = int(decl_pos[0]) - - if type_name != "": + if type_name != '': for name in decl_names: child_name = _get_name_from_nested_name(name) child_name_txt = (child_name.text @@ -359,612 +374,660 @@ def _parse_declaration( if child_name_txt != '': return { - "specifier": type_specifier_txt, - "type": type_name_txt, - "modifier": type_modifier_txt, - "name": child_name_txt, - "index_tag": index_tag, - "index_str": index_str, - "signature": re.sub("/s+", " ", " ".join( + 'specifier': type_specifier_txt, + 'type': type_name_txt, + 'modifier': type_modifier_txt, + 'name': child_name_txt, + 'index_tag': index_tag, + 'index_str': index_str, + 'signature': re.sub('/s+', ' ', ' '.join( [ type_specifier_txt, type_name_txt, type_modifier_txt, child_name_txt]).rstrip()), - "pos_row": decl_pos_row, - "file_name": belongs_to_file, - "parent_structure_name": parent_struct_name, - "parent_structure_type": parent_struct_type, + 'pos_row': decl_pos_row, + 'file_name': belongs_to_file, + 'parent_structure_name': parent_struct_name, + 'parent_structure_type': parent_struct_type, } return None -def _parse_el_for_global_variable_write( - element, - function_declaration_list, - parameters_passed_by_reference, - pointer_declarations, - calls, - variable_writes, - parent_declarations): - - decl_names = [d["name"] for d in [*function_declaration_list, *pointer_declarations]] - - expr_str = "" - - fan_out_var_candidates = [] 
- - expr_children = [child for child in element.iter()] - expr_str = ''.join([child for child in element.itertext()]) - - expr_names = element.findall(rf"{{{SRC_NS}}}name") - operators = element.findall(rf"{{{SRC_NS}}}operator") - - incr_decr_op = next( - (op for op in operators - if op is not None and op.text is not None and re.fullmatch(r"^\+\+|\-\-$", op.text)), - None) - - incr_decr_op_txt = (incr_decr_op.text - if incr_decr_op is not None and incr_decr_op.text is not None - else '') - - incr_decr_op_pos = (incr_decr_op.attrib[rf"{{{POS_NS}}}start"].split(':') - if incr_decr_op is not None and rf"{{{POS_NS}}}start" in incr_decr_op.attrib.keys() - else [-1, -1]) - - incr_decr_op_row = int(incr_decr_op_pos[0]) - incr_decr_op_col = int(incr_decr_op_pos[1]) - - equals_ops = [op for op in operators - if op is not None - and op.text is not None - and re.fullmatch(r"^\=|\+\=|\-\=|\*\=|\\\=$", op.text)] - - if len(equals_ops) == 0: - equals_ops = [None] - - last_equals_op_txt = (equals_ops[-1].text - if equals_ops[-1] is not None and equals_ops[-1].text is not None - else '') - - last_equals_op_pos = (equals_ops[-1].attrib[rf"{{{POS_NS}}}start"].split(':') - if equals_ops[-1] is not None and rf"{{{POS_NS}}}start" in equals_ops[-1].attrib.keys() - else [-1, -1]) - - last_equals_op_row = int(last_equals_op_pos[0]) - last_equals_op_col = int(last_equals_op_pos[1]) - - first_equals_op_pos = (equals_ops[0].attrib[rf"{{{POS_NS}}}start"].split(':') - if equals_ops[0] is not None - and rf"{{{POS_NS}}}start" in equals_ops[0].attrib.keys() - else [-1, -1]) - - first_equals_op_col = int(first_equals_op_pos[1]) - - if last_equals_op_txt != '' or incr_decr_op_txt != '': - if len(expr_names) > 0: - first_expr_name = expr_names[0] - first_expr_name_txt = '' - - for name in expr_names: - name_pos = name.attrib[rf"{{{POS_NS}}}start"].split(':') - name_pos_row = int(name_pos[0]) - name_pos_col = int(name_pos[1]) - - expr_sub_names = name.findall(rf"{{{SRC_NS}}}name") - - expr_sub_name = 
(_get_name_from_nested_name(expr_sub_names[0]) - if len(expr_sub_names) > 1 - else name) - - expr_sub_name_pos =( expr_sub_name.attrib[rf"{{{POS_NS}}}start"].split(':') - if expr_sub_name is not None - and rf"{{{POS_NS}}}start" in expr_sub_name.attrib.keys() - else [-1, -1]) - - expr_sub_name_pos_row = int(expr_sub_name_pos[0]) - expr_sub_name_pos_col = int(expr_sub_name_pos[1]) - - expr_index = name.find(rf"{{{SRC_NS}}}index") - - expr_index_pos = (expr_index.attrib[rf"{{{POS_NS}}}start"].split(':') - if expr_index is not None and rf"{{{POS_NS}}}start" in expr_index.keys() - else [-1, -1]) - - expr_index_pos_row = int(expr_index_pos[0]) - expr_index_pos_col = int(expr_index_pos[1]) - - first_expr_name_txt = (expr_sub_name.text - if expr_sub_name is not None - and expr_sub_name.text is not None - else ''.join([child_txt for child_txt in first_expr_name.itertext()])) - - name_signature = ''.join([child_txt for child_txt in name.itertext()]) - - name_op = name.findall(rf"{{{SRC_NS}}}operator") - - member_access_op = next( - (op for op in name_op - if op is not None and op.text is not None - and (op.text == '->' or op.text == '.')), None) - - member_access_op_pos = (member_access_op.attrib[rf"{{{POS_NS}}}start"].split(':') - if member_access_op is not None - and rf"{{{POS_NS}}}start" in member_access_op.attrib.keys() - else [-1, -1]) +class FunctionCollector: + def __init__(self, language): + self.language = language + (self.RESERVED_FUNCTIONS, + self.RESERVED_STREAMS, + self.RESERVED_KEYWORDS) = _get_constants_from_language(language) - member_access_op_pos_row = int(member_access_op_pos[0]) - member_access_op_pos_col = int(member_access_op_pos[1]) + def _parse_el_for_global_variable_write( + self, + element, + function_declaration_list, + pointer_declarations, + variable_writes): + decl_names = [d['name'] for d in [ + *function_declaration_list, + *pointer_declarations]] - members_accessed = [] - expr_mod_statements = [] - indices = [] + expr_str = '' + + 
fan_out_var_candidates = [] - index_accessed_str = '' + expr_children = [child for child in element.iter()] + expr_str = ''.join([child for child in element.itertext()]) - if (member_access_op is not None - and member_access_op_pos_row == expr_sub_name_pos_row - and member_access_op_pos_col > expr_sub_name_pos_col - and (member_access_op_pos_col < first_equals_op_col or incr_decr_op_col != -1) - ): + expr_names = element.findall(f'{{{SRC_NS}}}name') + operators = element.findall(f'{{{SRC_NS}}}operator') + + incr_decr_op = next( + (op for op in operators + if op is not None and + op.text is not None and + op.text in {'++', '--'}), + None) + + incr_decr_op_txt = (incr_decr_op.text + if incr_decr_op is not None and incr_decr_op.text is not None + else '') + + incr_decr_op_pos = tuple(_get_span(incr_decr_op)[0]) + incr_decr_op_row = int(incr_decr_op_pos[0]) + incr_decr_op_col = int(incr_decr_op_pos[1]) + + equals_ops = [op for op in operators + if op is not None + and op.text is not None + and op.text in { + '=', + '+=', + '-=', + '*=', + '\\=' + }] + + last_equals_op_row = last_equals_op_col = first_equals_op_col = -1 + last_equals_op_txt = '' + + if len(equals_ops) > 0: + last_equals_op_txt = (equals_ops[-1].text + if equals_ops[-1].text is not None else '') + + last_equals_op_pos = tuple(_get_span(equals_ops[-1])[0]) + last_equals_op_row = int(last_equals_op_pos[0]) + last_equals_op_col = int(last_equals_op_pos[1]) + + first_equals_op_pos = tuple(_get_span(equals_ops[0])[0]) + first_equals_op_col = int(first_equals_op_pos[1]) + + if last_equals_op_txt != '' or incr_decr_op_txt != '': + if len(expr_names) > 0: + first_expr_name = expr_names[0] + first_expr_name_txt = '' + + for name in expr_names: + name_pos = tuple(_get_span(name)[0]) + name_pos_row = int(name_pos[0]) + name_pos_col = int(name_pos[1]) + + expr_sub_names = name.findall(f'{{{SRC_NS}}}name') + + expr_sub_name = (_get_name_from_nested_name( + expr_sub_names[0]) + + if len(expr_sub_names) > 1 + else 
name) + + expr_sub_name_pos = tuple(_get_span(expr_sub_name)[0]) + + expr_sub_name_pos_row = int(expr_sub_name_pos[0]) + expr_sub_name_pos_col = int(expr_sub_name_pos[1]) + + expr_index = name.find(f'{{{SRC_NS}}}index') + + expr_index_pos = tuple(_get_span(expr_index)[0]) + expr_index_pos_row = int(expr_index_pos[0]) + expr_index_pos_col = int(expr_index_pos[1]) + + first_expr_name_txt = (expr_sub_name.text + if expr_sub_name is not None + and expr_sub_name.text is not None + else ''.join(child_txt + for child_txt in first_expr_name.itertext())) + + name_signature = ''.join(child_txt + for child_txt in name.itertext()) + + name_op = name.findall(f'{{{SRC_NS}}}operator') + + access_op = next( + (op for op in name_op + if op is not None and op.text is not None + and (op.text == '->' or op.text == '.')), None) + + access_op_pos = tuple( + _get_span(access_op)[0]) + access_op_pos_row = int(access_op_pos[0]) + access_op_pos_col = int(access_op_pos[1]) - member_accessed_str = '' + members_accessed = [] + expr_mod_statements = [] + indices = [] - for child in expr_children: - child_pos = (child.attrib[rf"{{{POS_NS}}}start"].split(':') - if rf"{{{POS_NS}}}start" in child.attrib.keys() - else [-1, -1]) + index_accessed_str = '' + + if (access_op is not None + and access_op_pos_row == expr_sub_name_pos_row + and access_op_pos_col > expr_sub_name_pos_col + and ( + access_op_pos_col < first_equals_op_col or + incr_decr_op_col != -1) + ): + + member_accessed_str = '' - child_pos_row = int(child_pos[0]) - child_pos_col = int(child_pos[1]) + for child in expr_children: + child_pos = tuple(_get_span(child)[0]) + child_pos_row = int(child_pos[0]) + child_pos_col = int(child_pos[1]) - child_txt = ''.join(child.itertext()) if child.text is None else child.text + child_txt = (''.join(child.itertext()) + if child.text is None else child.text) - if ( - child_pos_row == member_access_op_pos_row and - child_pos_col > member_access_op_pos_col and - (child_pos_col < first_equals_op_col or 
incr_decr_op_col != -1)): + if ( + child_pos_row == access_op_pos_row and + child_pos_col > access_op_pos_col and + ( + child_pos_col < first_equals_op_col or + incr_decr_op_col != -1)): - if child_txt != '': - if ( - expr_index_pos_col > member_access_op_pos_col and - expr_index_pos_row == member_access_op_pos_row): + if (child_txt != '' and + expr_index_pos_col > access_op_pos_col and + expr_index_pos_row == access_op_pos_row): index_accessed_str += child_txt else: member_accessed_str += child_txt - elif ( - child_pos_col < first_equals_op_col and - expr_index_pos_col < first_equals_op_col and - expr_index_pos_col != -1): - if child_txt != '': - if expr_index_pos_row == member_access_op_pos_row: - index_accessed_str += child_txt - - if index_accessed_str != '': - indices.append(index_accessed_str) - - if member_accessed_str != '': - members_accessed.append(member_accessed_str) - - elif member_access_op is None and expr_index is None: - expr_mod_statements.append(expr_str) - - if first_expr_name_txt != "this" and first_expr_name_txt not in decl_names: - fan_out_var_candidates.append({ - "name": first_expr_name_txt, - "signature": name_signature, - "row_pos": name_pos_row, - "col_pos": name_pos_col, - "members_accessed": members_accessed, - "indices" : indices, - "expr_mod_statements": expr_mod_statements - }) - - for cand in fan_out_var_candidates: - if ( - last_equals_op_txt != '' and - last_equals_op_col > cand["col_pos"] and - last_equals_op_row == cand["row_pos"]): - if cand["name"] not in variable_writes.keys(): - variable_writes[cand["name"]] = { - 'expressions': cand["expr_mod_statements"], - 'members_modified': cand["members_accessed"], - 'indices_modified': cand["indices"] - } - else: - variable_writes[cand["name"]]['expressions'] = [ - *variable_writes[cand["name"]]['expressions'], - *cand["expr_mod_statements"]] - - variable_writes[cand["name"]]['members_modified'] = [ - *variable_writes[cand["name"]]['members_modified'], - *cand["members_accessed"]] - 
- variable_writes[cand["name"]]['indices_modified'] = [ - *variable_writes[cand["name"]]['indices_modified'], - *cand["indices"]] - - elif incr_decr_op_txt and incr_decr_op_row == cand["row_pos"]: - if cand["name"] not in variable_writes.keys(): - variable_writes[cand["name"]] = { - 'expressions': cand["expr_mod_statements"], - 'members_modified': cand["members_accessed"], - 'indices_modified': cand["indices"] - } - else: - variable_writes[cand["name"]]['expressions'] = [ - *variable_writes[cand["name"]]['expressions'], - *cand["expr_mod_statements"]] - - variable_writes[cand["name"]]['members_modified'] = [ - *variable_writes[cand["name"]]['members_modified'], - *cand["members_accessed"]] - - variable_writes[cand["name"]]['indices_modified'] = [ - *variable_writes[cand["name"]]['indices_modified'], - *cand["indices"]] - -def _parse_el_for_global_variable_read( - expr, - calls, - function_declarations, - pointer_declarations, - params, - local_function_names, - enums, - read_variable_names, - function_throws_exception_names, - parent_declarations): - declaration_names = [d["name"] for d in function_declarations] - parent_declaration_var_names= [d["name"] for d in parent_declarations if d is not None] - param_names = [p["name"] for p in params] - - call_arg_names = [] - - for key in calls.keys(): - call_arg_names = [*call_arg_names, *calls[key]["cumulative_args"]] - - expr_names = expr.findall(rf"{{{SRC_NS}}}name") - - ops = expr.findall(rf"{{{SRC_NS}}}operator") if expr is not None else None - - last_op = next( - (op for op in list(reversed(ops)) - if op is not None and - op.text is not None and - re.fullmatch(r"^\=|\+\=|\-\=|\*\=|\\\=$", op.text)), None) - - incr_decr_op = next(( - op for op in ops if op is not None and - op.text is not None and - re.fullmatch(r"^\+\+|\-\-$", op.text)), None) - - incr_decr_op_pos = (incr_decr_op.attrib[rf"{{{POS_NS}}}start"].split(':') - if incr_decr_op is not None and - rf"{{{POS_NS}}}start" in incr_decr_op.attrib.keys() - else 
[-1, -1]) - - incr_decr_op_col = int(incr_decr_op_pos[1]) - - equal_op_pos = ( - last_op.attrib[rf'{{{POS_NS}}}start'].split(':') - if last_op is not None and - rf'{{{POS_NS}}}start' in last_op.attrib.keys() - else [-1, -1]) - - equal_op_pos_col = int(equal_op_pos[1]) - - for arg in call_arg_names: - if( - not isinstance(arg, (int, float, bytes)) and - arg != "" and - arg is not None and - arg not in C_RESERVED_KEYWORDS and - arg not in C_LIB_STREAMS and - not re.match(r"^null$", arg, flags=re.IGNORECASE) and - arg not in declaration_names and - arg not in param_names and - ( - - ( - arg not in list(calls) and - arg not in C_LIB_FUNCTIONS and - arg not in local_function_names and - arg not in enums and - arg not in function_throws_exception_names - ) - or - arg in parent_declaration_var_names - or - arg in param_names - ) - ): - read_variable_names.append(arg) - - for name in expr_names: - name_txt = _get_full_name_text_from_name(name) - - name_pos = ( - name.attrib[rf'{{{POS_NS}}}start'].split(':') - if rf'{{{POS_NS}}}start' in name.attrib.keys() - else [-1, -1]) - - name_pos_col = int(name_pos[1]) - - name_member_access_txt = re.split(r"\-\>|\[|\.", name_txt, 1)[0] - - if( - name_pos_col >= equal_op_pos_col and - equal_op_pos_col <= incr_decr_op_col and - name_member_access_txt != "" and - name_member_access_txt is not None and - name_member_access_txt not in C_RESERVED_KEYWORDS and - name_member_access_txt not in C_LIB_STREAMS and - not re.match(r"^null$", name_member_access_txt, flags=re.IGNORECASE) and - name_member_access_txt not in declaration_names and + elif ( + child_pos_col < first_equals_op_col and + expr_index_pos_col < first_equals_op_col and + expr_index_pos_col != -1 and + child_txt != '' and + expr_index_pos_row == access_op_pos_row): + index_accessed_str += child_txt + + if index_accessed_str != '': + indices.append(index_accessed_str) + + if member_accessed_str != '': + members_accessed.append(member_accessed_str) + + elif access_op is None and 
expr_index is None: + expr_mod_statements.append(expr_str) + + if (first_expr_name_txt != 'this' and + first_expr_name_txt not in decl_names): + fan_out_var_candidates.append({ + 'name': first_expr_name_txt, + 'signature': name_signature, + 'row_pos': name_pos_row, + 'col_pos': name_pos_col, + 'members_accessed': members_accessed, + 'indices' : indices, + 'expr_mod_statements': expr_mod_statements + }) + + for cand in fan_out_var_candidates: + if ( + last_equals_op_txt != '' and + last_equals_op_col > cand['col_pos'] and + last_equals_op_row == cand['row_pos']): + if cand['name'] not in variable_writes.keys(): + variable_writes[cand['name']] = { + 'expressions': cand['expr_mod_statements'], + 'members_modified': cand['members_accessed'], + 'indices_modified': cand['indices'] + } + else: + variable_writes[cand['name']]['expressions'] = [ + *variable_writes[cand['name']]['expressions'], + *cand['expr_mod_statements']] + + variable_writes[cand['name']]['members_modified'] = [ + *variable_writes[cand['name']]['members_modified'], + *cand['members_accessed']] + + variable_writes[cand['name']]['indices_modified'] = [ + *variable_writes[cand['name']]['indices_modified'], + *cand['indices']] + + elif incr_decr_op_txt and incr_decr_op_row == cand['row_pos']: + if cand['name'] not in variable_writes.keys(): + variable_writes[cand['name']] = { + 'expressions': cand['expr_mod_statements'], + 'members_modified': cand['members_accessed'], + 'indices_modified': cand['indices'] + } + else: + variable_writes[cand['name']]['expressions'] = [ + *variable_writes[cand['name']]['expressions'], + *cand['expr_mod_statements']] + + variable_writes[cand['name']]['members_modified'] = [ + *variable_writes[cand['name']]['members_modified'], + *cand['members_accessed']] + + variable_writes[cand['name']]['indices_modified'] = [ + *variable_writes[cand['name']]['indices_modified'], + *cand['indices']] + + def _parse_el_for_global_variable_read( + self, + expr, + calls, + function_declarations, + 
params, + local_function_names, + enums, + read_variable_names, + throws_exception_names, + parent_declarations): + declaration_names = [d['name'] for d in function_declarations] + + parent_declaration_var_names= [d['name'] + for d in parent_declarations if d is not None] + + param_names = [p['name'] for p in params] + + call_arg_names = [] + + for key in calls.keys(): + call_arg_names = [*call_arg_names, *calls[key]['cumulative_args']] + + expr_names = expr.findall(f'{{{SRC_NS}}}name') + + ops = (expr.findall(f'{{{SRC_NS}}}operator') + if expr is not None else None) + + last_op = next( + (op for op in list(reversed(ops)) + if op is not None and + op.text is not None and + op.text in {'=','+=','-=','*=','\\='}), None) + + incr_decr_op = next(( + op for op in ops if op is not None and + op.text is not None and + op.text in {'++', '--'}), None) + + incr_decr_op_pos = tuple(_get_span(incr_decr_op)[0]) + incr_decr_op_col = int(incr_decr_op_pos[1]) + + equal_op_pos = tuple(_get_span(last_op)[0]) + equal_op_pos_col = int(equal_op_pos[1]) + + for arg in call_arg_names: + if( + not isinstance(arg, (int, float, bytes)) and + arg != '' and + arg is not None and + arg not in self.RESERVED_KEYWORDS and + arg not in self.RESERVED_STREAMS and + not re.match(r'^null$', arg, flags=re.IGNORECASE) and + arg not in declaration_names and + arg not in param_names and ( ( - name_member_access_txt not in list(calls) and - name_member_access_txt not in C_LIB_FUNCTIONS and - name_member_access_txt not in local_function_names and - name_member_access_txt not in enums and - name_member_access_txt not in function_throws_exception_names + arg not in list(calls) and + arg not in self.RESERVED_FUNCTIONS and + arg not in local_function_names and + arg not in enums and + arg not in throws_exception_names ) or - name_member_access_txt in parent_declaration_var_names + arg in parent_declaration_var_names or - name_member_access_txt in param_names + arg in param_names ) - ): - 
read_variable_names.append(name_txt) - - read_variable_names = list(set([*read_variable_names])) - -def _compile_acyclical_paths_tree(root): - root_paths = [] - - root_block = root.find(rf"{{{SRC_NS}}}block") - root_block_content = ( - root_block.find(rf"{{{SRC_NS}}}block_content") - if root_block is not None - else root_block if root_block is not None - else root) - - for child in list(root_block_content): - if re.search(rf'{{{SRC_NS}}}if_stmt', child.tag): - root_paths = [ - *root_paths, - *_compile_acyclical_paths_tree(child)] - - elif re.search (rf'{{{SRC_NS}}}if|else', child.tag): - if_type = child.attrib["type"] if "type" in child.attrib.keys() else "" - - root_paths.append({ - "type": child.tag, - "if_type": if_type, - "children": _compile_acyclical_paths_tree(child) - }) - elif re.search(rf'{{{SRC_NS}}}for|while|do', child.tag): - root_paths.append({ - "type": child.tag, - "children": _compile_acyclical_paths_tree(child) - }) - elif re.search(rf"{{{SRC_NS}}}switch", child.tag): - root_paths.append({ - "type": child.tag, - "children": _compile_acyclical_paths_tree(child) - }) - elif re.search(rf'{{{SRC_NS}}}case|default', child.tag): - root_paths.append({ - "type": child.tag, - "children": _compile_acyclical_paths_tree(child) - }) - elif re.search(rf"{{{SRC_NS}}}ternary", child.tag): - root_paths.append({ - "type": child.tag, - "children": _compile_acyclical_paths_tree(child) - }) - elif re.search(rf"{{{SRC_NS}}}then", child.tag): - root_paths.append({ - "type": child.tag, - "children": [] - }) - - return root_paths - -def get_function_global_var_ops_and_paths( - function_element, - function_dict, - all_local_call_names, - parent_struct_name, - parent_struct_type, - parent_declarations, - file_name, - enums, - local_function_names, - language): - - if re.search(rf"{{{SRC_NS}}}function|constructor", function_element.tag): - func_sig =_get_signature(function_element) - func_name = _get_name(function_element) - block = 
function_element.find(rf"{{{SRC_NS}}}block") - - has_return_value = False - - acyc_paths = _compile_acyclical_paths_tree(function_element) - - throws_exception_names = [] - declarations = [] - pointer_decls = [] - - calls = {} - macro_calls = {} - - global_variable_writes = {} - global_variable_reads = [] - - if block is not None: - param_data = _get_param_data(function_element) - param_count = len(param_data["parameters"]) - - for func_child in function_element.iter(): - decl = _parse_declaration( - func_child, - parent_struct_name=parent_struct_name, - parent_struct_type=parent_struct_type, - belongs_to_file=file_name) - - call = _parse_function_call(func_child) - macros = _parse_macro_call(func_child, language) - throws = _get_throws_expression_names(func_child) - - if throws != []: - throws_exception_names = [*throws_exception_names, *throws] - - if decl is not None: - if decl["modifier"] == "*": - pointer_decls.append(decl) - else: - declarations.append(decl) - - if call is not None: - calls = {**calls, **call} - all_local_call_names = [*all_local_call_names, *call.keys()] - - if macros is not None: - macro_calls = {**macro_calls, **macros} - - if re.search(rf'{{{SRC_NS}}}return', func_child.tag) and has_return_value is False: - return_expr = func_child.find(rf"{{{SRC_NS}}}expr") - if return_expr is not None: - has_return_value = True - - if re.search(rf"{{{SRC_NS}}}expr|decl_stmt", func_child.tag): - _parse_el_for_global_variable_write( - element = func_child, - function_declaration_list = declarations, - parameters_passed_by_reference = param_data[ - "parameters_passed_by_reference"], - pointer_declarations = pointer_decls, - calls = calls, - variable_writes = global_variable_writes, - parent_declarations = parent_declarations + ): + read_variable_names.append(arg) + + for name in expr_names: + name_txt = _get_full_name_text_from_name(name) + + name_pos = tuple(_get_span(name)[0]) + name_pos_col = int(name_pos[1]) + + name_accessed_txt = 
re.split(r'\-\>|\[|\.', name_txt, 1)[0] + + if( + name_pos_col >= equal_op_pos_col and + equal_op_pos_col <= incr_decr_op_col and + name_accessed_txt != '' and + name_accessed_txt is not None and + name_accessed_txt not in self.RESERVED_KEYWORDS and + name_accessed_txt not in self.RESERVED_STREAMS and + not re.match( + r'^null$', + name_accessed_txt, + flags=re.IGNORECASE) and + name_accessed_txt not in declaration_names and + ( + ( + name_accessed_txt not in list(calls) and + name_accessed_txt not in self.RESERVED_FUNCTIONS and + name_accessed_txt not in local_function_names and + name_accessed_txt not in enums and + name_accessed_txt not in throws_exception_names ) - - _parse_el_for_global_variable_read( - expr = func_child, - calls = calls, - function_declarations = declarations, - pointer_declarations = pointer_decls, - params = param_data["parameters"], - local_function_names = local_function_names, - enums = enums, - read_variable_names = global_variable_reads, - function_throws_exception_names = throws_exception_names, - parent_declarations = parent_declarations + or + name_accessed_txt in parent_declaration_var_names + or + name_accessed_txt in param_names + ) + ): + read_variable_names.append(name_txt) + + read_variable_names = list(set([*read_variable_names])) + + def _compile_acyclical_paths_tree(self, root): + root_paths = [] + + root_block = root.find(f'{{{SRC_NS}}}block') + root_block_content = ( + root_block.find(f'{{{SRC_NS}}}block_content') + if root_block is not None + else root_block if root_block is not None + else root) + + for child in list(root_block_content): + if child.tag == f'{{{SRC_NS}}}if_stmt': + root_paths = [ + *root_paths, + *self._compile_acyclical_paths_tree(child)] + + elif child.tag in { + f'{{{SRC_NS}}}if', + f'{{{SRC_NS}}}else' + }: + if_type = (child.attrib['type'] + if 'type' in child.attrib.keys() else '') + + root_paths.append({ + 'type': child.tag, + 'if_type': if_type, + 'children': 
self._compile_acyclical_paths_tree(child) + }) + elif child.tag in { + f'{{{SRC_NS}}}for', + f'{{{SRC_NS}}}while', + f'{{{SRC_NS}}}do' + }: + root_paths.append({ + 'type': child.tag, + 'children': self._compile_acyclical_paths_tree(child) + }) + elif child.tag == f'{{{SRC_NS}}}switch': + root_paths.append({ + 'type': child.tag, + 'children': self._compile_acyclical_paths_tree(child) + }) + elif child.tag in { + f'{{{SRC_NS}}}case', + f'{{{SRC_NS}}}default' + }: + root_paths.append({ + 'type': child.tag, + 'children': self._compile_acyclical_paths_tree(child) + }) + elif child.tag == f'{{{SRC_NS}}}ternary': + root_paths.append({ + 'type': child.tag, + 'children': self._compile_acyclical_paths_tree(child) + }) + elif child.tag == f'{{{SRC_NS}}}then': + root_paths.append({ + 'type': child.tag, + 'children': [] + }) + + return root_paths + + def get_function_properties( + self, + function_element, + function_dict, + all_local_call_names, + parent_struct_name, + parent_struct_type, + parent_declarations, + file_name, + enums, + local_function_names, + language): + + if function_element.tag in { + f'{{{SRC_NS}}}function', + f'{{{SRC_NS}}}constructor' + }: + func_sig =_get_signature(function_element) + func_name = _get_name(function_element) + block = function_element.find(f'{{{SRC_NS}}}block') + + has_return_value = False + + acyc_paths = self._compile_acyclical_paths_tree(function_element) + + throws_exception_names = [] + declarations = [] + pointer_decls = [] + + calls = {} + macro_calls = {} + + global_variable_writes = {} + global_variable_reads = [] + + if block is not None: + param_data = _get_param_data(function_element) + param_count = len(param_data['parameters']) + + for func_child in function_element.iter(): + decl = _parse_declaration( + func_child, + parent_struct_name=parent_struct_name, + parent_struct_type=parent_struct_type, + belongs_to_file=file_name) + + call = _parse_function_call(func_child) + macros = _parse_macro_call(func_child, language) + 
throws = _get_throws_expression_names(func_child) + + if throws != []: + throws_exception_names = [ + *throws_exception_names, + *throws] + + if decl is not None: + if decl['modifier'] == '*': + pointer_decls.append(decl) + else: + declarations.append(decl) + + if call is not None: + calls = {**calls, **call} + all_local_call_names = [ + *all_local_call_names, + *call.keys()] + + if macros is not None: + macro_calls = {**macro_calls, **macros} + + if f'{{{SRC_NS}}}return' and has_return_value is False: + return_expr = func_child.find(f'{{{SRC_NS}}}expr') + if return_expr is not None: + has_return_value = True + + if func_child.tag in { + f'{{{SRC_NS}}}expr', + f'{{{SRC_NS}}}decl_stmt', + }: + self._parse_el_for_global_variable_write( + element = func_child, + function_declaration_list = declarations, + pointer_declarations = pointer_decls, + variable_writes = global_variable_writes, ) - global_variable_reads = list(set(global_variable_reads)) - - if func_sig not in function_dict.keys(): - local_function_names.append(func_name) - function_dict[func_sig] = { - "signature": func_sig, - "function_name": func_name, - "param_count": param_count, - "calls": calls, - "functions_called_by": [], - "acyclical_paths_tree": acyc_paths, - "has_return": has_return_value, - "parent_structure_name": parent_struct_name, - "parent_structure_type": parent_struct_type, - "file_name": file_name, - "global_variable_writes": global_variable_writes, - "global_variable_reads": list(set(global_variable_reads)) - } - - return function_dict - -def get_functions_with_metric_properties( - root_element, - parent_struct_name, - parent_struct_type, - parent_declarations, - file_name, - local_function_names, - enums, - all_local_call_names, - language - ): - parent_name_txt = parent_struct_name - local_declarations = parent_declarations - - function_dict = {} - for child in list(root_element): - if re.search(rf"{{{SRC_NS}}}class|struct|namespace|unit", child.tag): - parent_name = 
child.find(rf"{{{SRC_NS}}}name") - new_parent_struct_type = re.sub(r"{.+}", "", child.tag) - new_parent_name_txt = parent_struct_name + _get_full_name_text_from_name(parent_name) - - class_declarations = [ - _parse_declaration( - element = decl, - parent_struct_name = new_parent_name_txt, - parent_struct_type = new_parent_struct_type, - belongs_to_file = file_name) - for decl in child.findall(rf"{{{SRC_NS}}}decl_stmt")] - - class_enums = [ - _parse_enum(el) - for el in child.findall(rf'{{{SRC_NS}}}enum') - ] - - local_declarations = [*parent_declarations, *class_declarations] - enums = [*enums, *class_enums] - - function_dict = {**function_dict, - **get_functions_with_metric_properties( - root_element = child, - all_local_call_names = all_local_call_names, - parent_struct_name = new_parent_name_txt, - parent_struct_type = new_parent_struct_type, - parent_declarations = local_declarations, - file_name = file_name, - local_function_names=[f["function_name"] for f in function_dict.values()], - enums = enums, - language = language)} - - if re.search(rf"{{{SRC_NS}}}block|block_content", child.tag): - function_dict = {**function_dict, - **get_functions_with_metric_properties( - root_element = child, - all_local_call_names = all_local_call_names, - parent_struct_name = parent_name_txt, - parent_struct_type = parent_struct_type, - parent_declarations = local_declarations, - file_name = file_name, - local_function_names=[f["function_name"] for f in function_dict.values()], - enums = enums, - language = language)} - - if re.search(rf"{{{SRC_NS}}}function|constructor", child.tag): - updated_function_dict = get_function_global_var_ops_and_paths( - function_element = child, - function_dict = function_dict, + self._parse_el_for_global_variable_read( + expr = func_child, + calls = calls, + function_declarations = declarations, + params = param_data['parameters'], + local_function_names = local_function_names, + enums = enums, + read_variable_names = global_variable_reads, + 
throws_exception_names = throws_exception_names, + parent_declarations = parent_declarations + ) + + global_variable_reads = list(set(global_variable_reads)) + + if func_sig not in function_dict.keys(): + local_function_names.append(func_name) + function_dict[func_sig] = { + 'signature': func_sig, + 'function_name': func_name, + 'param_count': param_count, + 'calls': calls, + 'functions_called_by': [], + 'acyclical_paths_tree': acyc_paths, + 'has_return': has_return_value, + 'parent_structure_name': parent_struct_name, + 'parent_structure_type': parent_struct_type, + 'global_variable_writes': global_variable_writes, + 'global_variable_reads': list( + set(global_variable_reads)) + } + + return function_dict + + def get_functions_with_metric_properties( + self, + root_element, + parent_struct_name, + parent_struct_type, + parent_declarations, + file_name, + enums, + all_local_call_names, + language + ): + parent_name_txt = parent_struct_name + local_declarations = parent_declarations + + function_dict = {} + + class_els = root_element.findall(rf'{{{SRC_NS}}}class') + struct_els = root_element.findall(rf'{{{SRC_NS}}}struct') + namespace_els = root_element.findall(rf'{{{SRC_NS}}}namespace') + unit_els = root_element.findall(rf'{{{SRC_NS}}}unit') + function_els = root_element.findall(rf'{{{SRC_NS}}}function') + block_els = root_element.findall(rf'{{{SRC_NS}}}block') + block_content_els = root_element.findall(rf'{{{SRC_NS}}}block_content') + + #Sorts by row position + root_els = sorted([ + *class_els, + *struct_els, + *namespace_els, + *unit_els, + *function_els, + *block_els, + *block_content_els + ], key = lambda el: tuple(_get_span(el)[0])[0]) + + for child in root_els: + if child.tag in { + f'{{{SRC_NS}}}class', + f'{{{SRC_NS}}}struct', + f'{{{SRC_NS}}}namespace', + f'{{{SRC_NS}}}unit' + }: + parent_name = child.find(f'{{{SRC_NS}}}name') + new_parent_struct_type = re.sub(r'{.+}', '', child.tag) + + new_parent_name_txt = ( + parent_struct_name + + 
_get_full_name_text_from_name(parent_name)) + + class_declarations = [ + _parse_declaration( + element = decl, + parent_struct_name = new_parent_name_txt, + parent_struct_type = new_parent_struct_type, + belongs_to_file = file_name) + for decl in child.findall(f'{{{SRC_NS}}}decl_stmt')] + + class_enums = [ + _parse_enum(el) + for el in child.findall(f'{{{SRC_NS}}}enum') + ] + + local_declarations = [ + *parent_declarations, + *class_declarations] + + enums = [*enums, *class_enums] + + function_dict = {**function_dict, + **self.get_functions_with_metric_properties( + root_element = child, + all_local_call_names = all_local_call_names, + parent_struct_name = new_parent_name_txt, + parent_struct_type = new_parent_struct_type, + parent_declarations = local_declarations, + file_name = file_name, + enums = enums, + language = language)} + + if child.tag in { + f'{{{SRC_NS}}}block', + f'{{{SRC_NS}}}block_content' + }: + function_dict = {**function_dict, + **self.get_functions_with_metric_properties( + root_element = child, all_local_call_names = all_local_call_names, parent_struct_name = parent_name_txt, parent_struct_type = parent_struct_type, - parent_declarations = parent_declarations, + parent_declarations = local_declarations, file_name = file_name, - local_function_names=[f["function_name"] for f in function_dict.values()], enums = enums, - language = language) - - function_dict = {**function_dict, **updated_function_dict} - - return function_dict + language = language)} + + if child.tag in { + f'{{{SRC_NS}}}function', + f'{{{SRC_NS}}}constructor' + }: + updated_function_dict = self.get_function_properties( + function_element = child, + function_dict = function_dict, + all_local_call_names = all_local_call_names, + parent_struct_name = parent_name_txt, + parent_struct_type = parent_struct_type, + parent_declarations = parent_declarations, + file_name = file_name, + local_function_names=[ + f['function_name'] + for f in function_dict.values()], + enums = enums, + 
language = language) + + function_dict = {**function_dict, **updated_function_dict} + + return function_dict class SrcMLParser: def __init__(self, language): @@ -1003,7 +1066,7 @@ def get_functions_with_properties(self, file_name, contents): if srcml is None: logger.debug('Srcml parser is none') - return "None" + return None else: logger.debug('Successfully retrieved srcml parser') @@ -1015,22 +1078,24 @@ def get_functions_with_properties(self, file_name, contents): parent_struct_name=file_name, parent_struct_type='file', belongs_to_file='file_name') - for decl in root.findall(rf'{{{SRC_NS}}}decl_stmt')] + for decl in root.findall(f'{{{SRC_NS}}}decl_stmt')] root_enums = [ _parse_enum(el) - for el in root.findall(rf'{{{SRC_NS}}}enum') + for el in root.findall(f'{{{SRC_NS}}}enum') ] - func_dict = get_functions_with_metric_properties( + func_collect = FunctionCollector(language=self._language) + + func_dict = func_collect.get_functions_with_metric_properties( root_element = root, parent_struct_name = file_name, parent_struct_type= 'file', parent_declarations=root_declarations, all_local_call_names=[], - local_function_names=[], language=self._language, enums = root_enums, - file_name = file_name) + file_name = file_name + ) return func_dict diff --git a/parser/parser/schemas.py b/parser/parser/schemas.py index 15375a2..2dbec11 100644 --- a/parser/parser/schemas.py +++ b/parser/parser/schemas.py @@ -1,4 +1,4 @@ -from marshmallow import Schema, fields, post_load, EXCLUDE +from marshmallow import Schema, fields, post_load, INCLUDE from .models import Comment, Function, Position, Span @@ -34,7 +34,7 @@ class FunctionSchema(Schema): span = fields.Nested(SpanSchema) class Meta: - unknown = EXCLUDE + unknown = INCLUDE param_count = fields.Integer( default = 0, @@ -65,7 +65,6 @@ class Meta: parent_structure_name = fields.String(allow_none=True) parent_structure_type = fields.String(allow_none=True) - file_name = fields.String(allow_none=True) global_variable_writes = 
fields.Dict( keys = fields.String(), diff --git a/parser/parser/service.py b/parser/parser/service.py index 5124016..5dbdbb3 100644 --- a/parser/parser/service.py +++ b/parser/parser/service.py @@ -2,7 +2,6 @@ from nameko.dependency_providers import Config from nameko.rpc import rpc -from nameko.testing.services import worker_factory from . import utilities from .languages import get_languages @@ -83,5 +82,4 @@ def get_functions_with_properties(self, name, contents): functions = function_list - return functions From 7568774b311381431d245ee72759562ce70be48f Mon Sep 17 00:00:00 2001 From: Brandon Date: Sat, 19 Jun 2021 00:47:17 -0400 Subject: [PATCH 4/4] Make parser service changes backwards compatible The original additions to the function schema and function model made it so that other metrics using the parser function schema were not compatible with the newly added fields. Without an extra schema, we would have to modify the function schemas of other metrics that rely on the parser service. Other changes include removing the functionality that parses macro calls in C due to the limitations of srcml, and doing away with the FunctionCollector class and moving the functions from the FunctionCollector class into the SrcMlParser class. It didn't make sense keeping two separate classes that serve a similar purpose when all it takes is one class to manage a single state. 
--- parser/parser/models.py | 57 +++++ parser/parser/parsers/srcmlparser.py | 353 +++++++++++++-------------- parser/parser/schemas.py | 55 +++-- parser/parser/service.py | 4 +- 4 files changed, 258 insertions(+), 211 deletions(-) diff --git a/parser/parser/models.py b/parser/parser/models.py index 86df356..846eff9 100644 --- a/parser/parser/models.py +++ b/parser/parser/models.py @@ -1,3 +1,4 @@ +from __future__ import annotations import dataclasses @dataclasses.dataclass(frozen=True) @@ -30,3 +31,59 @@ class Function: signature: str span: Span + +@dataclasses.dataclass(frozen=True) +class GlobalVariableWrite: + __slots__ = [ + 'expressions', + 'members_modified', + 'indices_modified'] + + expressions: list(str) + members_modified: list(str) + indices_modified: list(str) + +@dataclasses.dataclass(frozen=True) +class AcyclicalPath: + __slots__ = [ + 'type', + 'children', + 'if_type' + ] + + type: str + children: list(AcyclicalPath) + if_type: str + + def __init__( + self, + type: str, + children: list(AcyclicalPath), + if_type: str = None): + object.__setattr__(self, 'type', type) + object.__setattr__(self, 'children', children) + object.__setattr__(self, 'if_type', if_type) + +@dataclasses.dataclass(frozen=True) +class FunctionProperties(Function): + __slots__ = [ + 'function_name', + 'calls', + 'callers', + 'acyclical_paths_tree', + 'has_return', + 'parent_structure_name', + 'parent_structure_type', + 'global_variable_writes', + 'global_variable_reads' + ] + + function_name: str + calls: list(str) + callers: list(str) + acyclical_paths_tree: list(AcyclicalPath) + has_return: bool + parent_structure_name: str + parent_structure_type: str + global_variable_writes: list(GlobalVariableWrite) + global_variable_reads: list(str) diff --git a/parser/parser/parsers/srcmlparser.py b/parser/parser/parsers/srcmlparser.py index 327241f..21ba589 100644 --- a/parser/parser/parsers/srcmlparser.py +++ b/parser/parser/parsers/srcmlparser.py @@ -5,7 +5,8 @@ from xml.etree import 
ElementTree from ..enumerations import CommentType -from ..models import Comment, Function, Position, Span +from ..models import Comment, Function, Position, Span, \ + AcyclicalPath, GlobalVariableWrite, FunctionProperties from ..constants import _get_constants_from_language logger = logging.getLogger(__name__) @@ -83,14 +84,12 @@ def _get_name(element): return name def _get_span(element): - if element is not None: - position = element.attrib[f'{{{POS_NS}}}start'] - begin = (int(i) for i in position.split(':')) - position = element.attrib[f'{{{POS_NS}}}end'] - end = (int(i) for i in position.split(':')) + position = element.attrib[f'{{{POS_NS}}}start'] + begin = (int(i) for i in position.split(':')) + position = element.attrib[f'{{{POS_NS}}}end'] + end = (int(i) for i in position.split(':')) - return begin, end - return (-1, -1), (-1, -1) + return begin, end def _get_signature(element): def _join(values, delimiter=' '): @@ -258,37 +257,6 @@ def _parse_function_call(element): return None -def _parse_macro_call(element, language): - macro_calls = {} - if element.tag == f'{{{SRC_NS}}}macro': - macro_arg_list = element.find(f'{{{SRC_NS}}}argument_list') - macro_args = (macro_arg_list.findall(f'{{{SRC_NS}}}argument') - if macro_arg_list is not None - else None) - - if macro_args is not None: - for arg in macro_args: - arg_text = (arg.text - if arg is not None and arg.text is not None else '') - - if arg_text != '': - srcml = _get_srcml(arg_text, language) - rootet = ElementTree.fromstring(srcml) - - if re.search(r'{(.)+}', - arg_text, - flags=re.MULTILINE|re.DOTALL): - for child in rootet.iter(): - call = _parse_function_call(child) - - if call is not None: - macro_calls = {**macro_calls, **call} - - if macro_calls != {}: - return macro_calls - - return None - def _parse_declaration( element, parent_struct_name = '', @@ -402,6 +370,42 @@ def __init__(self, language): self.RESERVED_STREAMS, self.RESERVED_KEYWORDS) = _get_constants_from_language(language) + +class 
SrcMLParser: + def __init__(self, language): + self._language = language + (self.RESERVED_FUNCTIONS, + self.RESERVED_STREAMS, + self.RESERVED_KEYWORDS) = _get_constants_from_language(language) + + def get_comments(self, name, contents): + comments = None + + srcml = _get_srcml(contents, self._language) + if srcml is None: + logger.error('SrcML failed to parse %s', name) + else: + srcml = ElementTree.fromstring(srcml) + comments = list(_get_comments(srcml)) + + return comments + + def get_functions(self, name, contents): + functions = None + + lines = NEWLINE_RE.split(contents) + nlines = len(lines[:-1] if lines[-1] == '' else lines) + srcml = _get_srcml(contents, self._language) + if srcml is None: + logger.error('SrcML failed to parse %s', name) + else: + functions = list() + srcml = ElementTree.fromstring(srcml) + functions.extend(_get_declarations(srcml)) + functions.extend(_get_definitions(srcml, nlines)) + + return functions + def _parse_el_for_global_variable_write( self, element, @@ -433,7 +437,11 @@ def _parse_el_for_global_variable_write( if incr_decr_op is not None and incr_decr_op.text is not None else '') - incr_decr_op_pos = tuple(_get_span(incr_decr_op)[0]) + incr_decr_op_pos = (-1, -1) + + if incr_decr_op is not None: + incr_decr_op_pos = tuple(_get_span(incr_decr_op)[0]) + incr_decr_op_row = int(incr_decr_op_pos[0]) incr_decr_op_col = int(incr_decr_op_pos[1]) @@ -448,19 +456,19 @@ def _parse_el_for_global_variable_write( '\\=' }] - last_equals_op_row = last_equals_op_col = first_equals_op_col = -1 last_equals_op_txt = '' - + last_equals_op_pos = first_equals_op_pos = (-1, -1) if len(equals_ops) > 0: last_equals_op_txt = (equals_ops[-1].text if equals_ops[-1].text is not None else '') last_equals_op_pos = tuple(_get_span(equals_ops[-1])[0]) - last_equals_op_row = int(last_equals_op_pos[0]) - last_equals_op_col = int(last_equals_op_pos[1]) - first_equals_op_pos = tuple(_get_span(equals_ops[0])[0]) - first_equals_op_col = int(first_equals_op_pos[1]) + 
+ last_equals_op_row = int(last_equals_op_pos[0]) + last_equals_op_col = int(last_equals_op_pos[1]) + + first_equals_op_col = int(first_equals_op_pos[1]) if last_equals_op_txt != '' or incr_decr_op_txt != '': if len(expr_names) > 0: @@ -476,18 +484,23 @@ def _parse_el_for_global_variable_write( expr_sub_name = (_get_name_from_nested_name( expr_sub_names[0]) - if len(expr_sub_names) > 1 else name) - expr_sub_name_pos = tuple(_get_span(expr_sub_name)[0]) + expr_sub_name_pos = (-1, -1) + + if expr_sub_name is not None: + expr_sub_name_pos = tuple(_get_span(expr_sub_name)[0]) expr_sub_name_pos_row = int(expr_sub_name_pos[0]) expr_sub_name_pos_col = int(expr_sub_name_pos[1]) expr_index = name.find(f'{{{SRC_NS}}}index') + expr_index_pos = (-1 ,-1) + + if expr_index is not None: + expr_index_pos = tuple(_get_span(expr_index)[0]) - expr_index_pos = tuple(_get_span(expr_index)[0]) expr_index_pos_row = int(expr_index_pos[0]) expr_index_pos_col = int(expr_index_pos[1]) @@ -507,8 +520,12 @@ def _parse_el_for_global_variable_write( if op is not None and op.text is not None and (op.text == '->' or op.text == '.')), None) - access_op_pos = tuple( - _get_span(access_op)[0]) + access_op_pos = (-1, -1) + + if access_op is not None: + access_op_pos = tuple( + _get_span(access_op)[0]) + access_op_pos_row = int(access_op_pos[0]) access_op_pos_col = int(access_op_pos[1]) @@ -529,7 +546,11 @@ def _parse_el_for_global_variable_write( member_accessed_str = '' for child in expr_children: - child_pos = tuple(_get_span(child)[0]) + child_pos = (-1, -1) + + if child_pos is not None: + child_pos = tuple(_get_span(child)[0]) + child_pos_row = int(child_pos[0]) child_pos_col = int(child_pos[1]) @@ -579,48 +600,44 @@ def _parse_el_for_global_variable_write( }) for cand in fan_out_var_candidates: - if ( - last_equals_op_txt != '' and - last_equals_op_col > cand['col_pos'] and - last_equals_op_row == cand['row_pos']): - if cand['name'] not in variable_writes.keys(): - variable_writes[cand['name']] = 
{ - 'expressions': cand['expr_mod_statements'], - 'members_modified': cand['members_accessed'], - 'indices_modified': cand['indices'] - } - else: - variable_writes[cand['name']]['expressions'] = [ - *variable_writes[cand['name']]['expressions'], - *cand['expr_mod_statements']] - - variable_writes[cand['name']]['members_modified'] = [ - *variable_writes[cand['name']]['members_modified'], - *cand['members_accessed']] - - variable_writes[cand['name']]['indices_modified'] = [ - *variable_writes[cand['name']]['indices_modified'], - *cand['indices']] - - elif incr_decr_op_txt and incr_decr_op_row == cand['row_pos']: + if (( + last_equals_op_txt != '' and + last_equals_op_col > cand['col_pos'] and + last_equals_op_row == cand['row_pos'] + ) or ( + incr_decr_op_txt and + incr_decr_op_row == cand['row_pos'] + )): if cand['name'] not in variable_writes.keys(): - variable_writes[cand['name']] = { - 'expressions': cand['expr_mod_statements'], - 'members_modified': cand['members_accessed'], - 'indices_modified': cand['indices'] - } + variable_writes[cand['name']] = GlobalVariableWrite( + expressions = cand['expr_mod_statements'], + members_modified = cand['members_accessed'], + indices_modified = cand['indices'] + ) else: - variable_writes[cand['name']]['expressions'] = [ - *variable_writes[cand['name']]['expressions'], - *cand['expr_mod_statements']] - - variable_writes[cand['name']]['members_modified'] = [ - *variable_writes[cand['name']]['members_modified'], - *cand['members_accessed']] - - variable_writes[cand['name']]['indices_modified'] = [ - *variable_writes[cand['name']]['indices_modified'], - *cand['indices']] + object.__setattr__( + variable_writes[cand['name']], + 'expressions', + [ + *variable_writes[cand['name']].expressions, + *cand['expr_mod_statements'] + ]) + + object.__setattr__( + variable_writes[cand['name']], + 'members_modified', + [ + *variable_writes[cand['name']].members_modified, + *cand['members_accessed'] + ]) + + object.__setattr__( + 
variable_writes[cand['name']], + 'indices_modified', + [ + *variable_writes[cand['name']].indices_modified, + *cand['indices'] + ]) def _parse_el_for_global_variable_read( self, @@ -661,10 +678,17 @@ def _parse_el_for_global_variable_read( op.text is not None and op.text in {'++', '--'}), None) - incr_decr_op_pos = tuple(_get_span(incr_decr_op)[0]) + incr_decr_op_pos = (-1, -1) + if incr_decr_op is not None: + incr_decr_op_pos = tuple(_get_span(incr_decr_op)[0]) + incr_decr_op_col = int(incr_decr_op_pos[1]) - equal_op_pos = tuple(_get_span(last_op)[0]) + equal_op_pos = (-1, -1) + + if last_op is not None: + equal_op_pos = tuple(_get_span(last_op)[0]) + equal_op_pos_col = int(equal_op_pos[1]) for arg in call_arg_names: @@ -695,8 +719,11 @@ def _parse_el_for_global_variable_read( for name in expr_names: name_txt = _get_full_name_text_from_name(name) + name_pos = (-1, -1) + + if name is not None: + name_pos = tuple(_get_span(name)[0]) - name_pos = tuple(_get_span(name)[0]) name_pos_col = int(name_pos[1]) name_accessed_txt = re.split(r'\-\>|\[|\.', name_txt, 1)[0] @@ -754,43 +781,38 @@ def _compile_acyclical_paths_tree(self, root): if_type = (child.attrib['type'] if 'type' in child.attrib.keys() else '') - root_paths.append({ - 'type': child.tag, - 'if_type': if_type, - 'children': self._compile_acyclical_paths_tree(child) - }) + root_paths.append(AcyclicalPath( + type = child.tag, + if_type = if_type, + children = self._compile_acyclical_paths_tree(child))) elif child.tag in { f'{{{SRC_NS}}}for', f'{{{SRC_NS}}}while', f'{{{SRC_NS}}}do' }: - root_paths.append({ - 'type': child.tag, - 'children': self._compile_acyclical_paths_tree(child) - }) + root_paths.append(AcyclicalPath( + type = child.tag, + if_type = None, + children = self._compile_acyclical_paths_tree(child))) elif child.tag == f'{{{SRC_NS}}}switch': - root_paths.append({ - 'type': child.tag, - 'children': self._compile_acyclical_paths_tree(child) - }) + root_paths.append(AcyclicalPath( + type = child.tag, + 
children = self._compile_acyclical_paths_tree(child))) elif child.tag in { f'{{{SRC_NS}}}case', f'{{{SRC_NS}}}default' }: - root_paths.append({ - 'type': child.tag, - 'children': self._compile_acyclical_paths_tree(child) - }) + root_paths.append(AcyclicalPath( + type = child.tag, + children = self._compile_acyclical_paths_tree(child))) elif child.tag == f'{{{SRC_NS}}}ternary': - root_paths.append({ - 'type': child.tag, - 'children': self._compile_acyclical_paths_tree(child) - }) + root_paths.append(AcyclicalPath( + type = child.tag, + children = self._compile_acyclical_paths_tree(child))) elif child.tag == f'{{{SRC_NS}}}then': - root_paths.append({ - 'type': child.tag, - 'children': [] - }) + root_paths.append(AcyclicalPath( + type = child.tag, + children = [])) return root_paths @@ -804,8 +826,8 @@ def get_function_properties( parent_declarations, file_name, enums, - local_function_names, - language): + local_function_names + ): if function_element.tag in { f'{{{SRC_NS}}}function', @@ -813,6 +835,7 @@ def get_function_properties( }: func_sig =_get_signature(function_element) func_name = _get_name(function_element) + func_span = _get_span(function_element) block = function_element.find(f'{{{SRC_NS}}}block') has_return_value = False @@ -824,14 +847,12 @@ def get_function_properties( pointer_decls = [] calls = {} - macro_calls = {} global_variable_writes = {} global_variable_reads = [] if block is not None: param_data = _get_param_data(function_element) - param_count = len(param_data['parameters']) for func_child in function_element.iter(): decl = _parse_declaration( @@ -841,7 +862,6 @@ def get_function_properties( belongs_to_file=file_name) call = _parse_function_call(func_child) - macros = _parse_macro_call(func_child, language) throws = _get_throws_expression_names(func_child) if throws != []: @@ -861,9 +881,6 @@ def get_function_properties( *all_local_call_names, *call.keys()] - if macros is not None: - macro_calls = {**macro_calls, **macros} - if 
f'{{{SRC_NS}}}return' and has_return_value is False: return_expr = func_child.find(f'{{{SRC_NS}}}expr') if return_expr is not None: @@ -896,20 +913,20 @@ def get_function_properties( if func_sig not in function_dict.keys(): local_function_names.append(func_name) - function_dict[func_sig] = { - 'signature': func_sig, - 'function_name': func_name, - 'param_count': param_count, - 'calls': calls, - 'functions_called_by': [], - 'acyclical_paths_tree': acyc_paths, - 'has_return': has_return_value, - 'parent_structure_name': parent_struct_name, - 'parent_structure_type': parent_struct_type, - 'global_variable_writes': global_variable_writes, - 'global_variable_reads': list( + function_dict[func_sig] = FunctionProperties( + signature = func_sig, + span = func_span, + function_name = func_name, + calls = calls, + callers = [], + acyclical_paths_tree = acyc_paths, + has_return = has_return_value, + parent_structure_name = parent_struct_name, + parent_structure_type = parent_struct_type, + global_variable_writes = global_variable_writes, + global_variable_reads = list( set(global_variable_reads)) - } + ) return function_dict @@ -921,8 +938,7 @@ def get_functions_with_metric_properties( parent_declarations, file_name, enums, - all_local_call_names, - language + all_local_call_names ): parent_name_txt = parent_struct_name local_declarations = parent_declarations @@ -989,8 +1005,7 @@ def get_functions_with_metric_properties( parent_struct_type = new_parent_struct_type, parent_declarations = local_declarations, file_name = file_name, - enums = enums, - language = language)} + enums = enums)} if child.tag in { f'{{{SRC_NS}}}block', @@ -1004,8 +1019,7 @@ def get_functions_with_metric_properties( parent_struct_type = parent_struct_type, parent_declarations = local_declarations, file_name = file_name, - enums = enums, - language = language)} + enums = enums)} if child.tag in { f'{{{SRC_NS}}}function', @@ -1020,55 +1034,19 @@ def get_functions_with_metric_properties( 
parent_declarations = parent_declarations, file_name = file_name, local_function_names=[ - f['function_name'] + f.function_name for f in function_dict.values()], - enums = enums, - language = language) + enums = enums) function_dict = {**function_dict, **updated_function_dict} return function_dict -class SrcMLParser: - def __init__(self, language): - self._language = language - - def get_comments(self, name, contents): - comments = None - - srcml = _get_srcml(contents, self._language) - if srcml is None: - logger.error('SrcML failed to parse %s', name) - else: - srcml = ElementTree.fromstring(srcml) - comments = list(_get_comments(srcml)) - - return comments - - def get_functions(self, name, contents): - functions = None - - lines = NEWLINE_RE.split(contents) - nlines = len(lines[:-1] if lines[-1] == '' else lines) - srcml = _get_srcml(contents, self._language) - if srcml is None: - logger.error('SrcML failed to parse %s', name) - else: - functions = list() - srcml = ElementTree.fromstring(srcml) - functions.extend(_get_declarations(srcml)) - functions.extend(_get_definitions(srcml, nlines)) - - return functions - def get_functions_with_properties(self, file_name, contents): srcml = _get_srcml(contents, self._language) if srcml is None: - logger.debug('Srcml parser is none') return None - else: - logger.debug('Successfully retrieved srcml parser') root = ElementTree.fromstring(srcml) @@ -1085,15 +1063,12 @@ def get_functions_with_properties(self, file_name, contents): for el in root.findall(f'{{{SRC_NS}}}enum') ] - func_collect = FunctionCollector(language=self._language) - - func_dict = func_collect.get_functions_with_metric_properties( + func_dict = self.get_functions_with_metric_properties( root_element = root, parent_struct_name = file_name, parent_struct_type= 'file', parent_declarations=root_declarations, all_local_call_names=[], - language=self._language, enums = root_enums, file_name = file_name ) diff --git a/parser/parser/schemas.py 
b/parser/parser/schemas.py index 2dbec11..ab0a3df 100644 --- a/parser/parser/schemas.py +++ b/parser/parser/schemas.py @@ -1,7 +1,8 @@ -from marshmallow import Schema, fields, post_load, INCLUDE - -from .models import Comment, Function, Position, Span +from marshmallow import Schema, fields, post_load +from marshmallow.utils import INCLUDE +from .models import Comment, Function, FunctionProperties, \ + Position, Span, AcyclicalPath class PositionSchema(Schema): line = fields.Integer() @@ -29,32 +30,54 @@ class CommentSchema(Schema): def make_comment(self, data, **kwargs): return Comment(**data) +class AcyclicalPathSchema(Schema): + type = fields.String() + if_type = fields.String(allow_none=True) + children = fields.List( + fields.Nested( + lambda: AcyclicalPathSchema() + ) + ) + + @post_load + def make_acyclical_path(self, data, **kwargs): + return AcyclicalPath(**data) + +class GlobalVariableWrite(Schema): + expressions = fields.List(fields.String) + members_modified = fields.List(fields.String) + indices_modified = fields.List(fields.String) + + @post_load + def make_global_variable_write(self, data, **kwargs): + return GlobalVariableWrite(**data) + class FunctionSchema(Schema): signature = fields.String() span = fields.Nested(SpanSchema) + @post_load + def make_function(self, data, **kwargs): + return Function(**data) + +class FunctionPropertiesSchema(FunctionSchema): class Meta: unknown = INCLUDE - param_count = fields.Integer( - default = 0, - allow_none = True - ) - calls = fields.List( fields.String(), default = [], allow_none=True ) - functions_called_by = fields.List( + callers = fields.List( fields.String(), default = [], allow_none=True ) acyclical_paths_tree = fields.List( - fields.Dict(), + fields.Nested(AcyclicalPathSchema), allow_none=True ) @@ -68,14 +91,7 @@ class Meta: global_variable_writes = fields.Dict( keys = fields.String(), - values = fields.Dict( - keys = fields.String(), - values = fields.List( - fields.String, - default = [], - 
allow_none = True - ) - ), + values = fields.Nested(GlobalVariableWrite), allow_none = True ) @@ -84,7 +100,6 @@ class Meta: allow_none=True ) - @post_load def make_function(self, data, **kwargs): - return Function(**data) + return FunctionProperties(**data) diff --git a/parser/parser/service.py b/parser/parser/service.py index 5dbdbb3..8562ba8 100644 --- a/parser/parser/service.py +++ b/parser/parser/service.py @@ -6,7 +6,7 @@ from . import utilities from .languages import get_languages from .parsers import get_parser -from .schemas import CommentSchema, FunctionSchema +from .schemas import CommentSchema, FunctionSchema, FunctionPropertiesSchema logger = logging.getLogger(__name__) @@ -76,7 +76,7 @@ def get_functions_with_properties(self, name, contents): if functions is not None and functions != {}: for key in functions.keys(): function_list.append( - FunctionSchema(many=False) + FunctionPropertiesSchema(many=False) .dump(functions[key]) )