From 4f53b31be300af957b7bf9cbc38dc5be28433427 Mon Sep 17 00:00:00 2001
From: Lansus
Date: Tue, 8 Apr 2025 16:27:43 +0800
Subject: [PATCH 1/7] refactor: optimize getFieldParser and createRecordParser methods.

---
 pystdf/IO.py | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index 484b514..cc1a19a 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -27,18 +27,10 @@
 
 from pystdf.Pipeline import DataSource
 
-def appendFieldParser(fn, action):
-    """Append a field parsing function to a record parsing function.
-    This is used to build record parsing functions based on the record type specification."""
-    def newRecordParser(*args):
-        fields = fn(*args)
-        try:
-            fields.append(action(*args))
-        except EndOfRecordException: pass
-        return fields
-    return newRecordParser
-
 class Parser(DataSource):
+    _k_field_pattern = re.compile('k(\d+)([A-Z][a-z0-9]+)')
+    _cached_field_parsers = {}
+
     def readAndUnpack(self, header, fmt):
         size = struct.calcsize(fmt)
         if (size > header.len):
@@ -193,17 +185,26 @@ def parse(self, count=0):
             raise
 
     def getFieldParser(self, fieldType):
-        if (fieldType.startswith("k")):
-            fieldIndex, arrayFmt = re.match('k(\d+)([A-Z][a-z0-9]+)', fieldType).groups()
-            return lambda self, header, fields: self.readArray(header, fields[int(fieldIndex)], arrayFmt)
-        else:
-            parseFn = self.unpackMap[fieldType]
-            return lambda self, header, fields: parseFn(header, fieldType)
+        if fieldType not in self._cached_field_parsers:
+            if (fieldType.startswith("k")):
+                fieldIndex, arrayFmt = self._k_field_pattern.match(fieldType).groups()
+                def parse_field(parser, header, fields):
+                    return parser.readArray(header, fields[int(fieldIndex)], arrayFmt)
+            else:
+                parseFn = self.unpackMap[fieldType]
+                def parse_field(parser, header, fields):
+                    return parseFn(header, fieldType)
+            self._cached_field_parsers[fieldType] = parse_field
+        return self._cached_field_parsers[fieldType]
 
     def createRecordParser(self, recType):
-        fn = lambda self, header, fields: fields
-        for stdfType in recType.fieldStdfTypes:
-            fn = appendFieldParser(fn, self.getFieldParser(stdfType))
+        field_parsers = tuple(self.getFieldParser(stdfType) for stdfType in recType.fieldStdfTypes)
+        def fn(parser, header, fields):
+            try:
+                for parse_field in field_parsers:
+                    fields.append(parse_field(parser, header, fields))
+            except EndOfRecordException: pass
+            return fields
         return fn
 
     def __init__(self, recTypes=V4.records, inp=sys.stdin, reopen_fn=None, endian=None):

From 70523687af1cfa17287e16bfeccc645fc2fdd794 Mon Sep 17 00:00:00 2001
From: Lansus
Date: Tue, 8 Apr 2025 19:54:32 +0800
Subject: [PATCH 2/7] style: move field parser cache initialization into __init__.

---
 pystdf/IO.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index cc1a19a..62aec38 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -29,7 +29,6 @@ from pystdf.Pipeline import DataSource
 
 class Parser(DataSource):
     _k_field_pattern = re.compile('k(\d+)([A-Z][a-z0-9]+)')
-    _cached_field_parsers = {}
 
     def readAndUnpack(self, header, fmt):
         size = struct.calcsize(fmt)
@@ -214,6 +213,7 @@ def __init__(self, recTypes=V4.records, inp=sys.stdin, reopen_fn=None, endian=No
         self.inp = inp
         self.reopen_fn = reopen_fn
         self.endian = endian
+        self._cached_field_parsers = {}
         self.recordMap = dict(
             [ ( (recType.typ, recType.sub), recType )
               for recType in recTypes ])
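The two patches above replace the per-record lambda chain with field-parsing closures that are built once per field type and reused, with the cache held per Parser instance. A minimal standalone sketch of that caching pattern (the Demo class and its names are illustrative, not part of pystdf):

    class Demo:
        def __init__(self):
            self._cached_parsers = {}        # per-instance cache, as PATCH 2 arranges

        def get_parser(self, field_type):
            if field_type not in self._cached_parsers:
                def parse(value):            # closure captures field_type once
                    return (field_type, value)
                self._cached_parsers[field_type] = parse
            return self._cached_parsers[field_type]

    d = Demo()
    assert d.get_parser("U4") is d.get_parser("U4")  # built once, then reused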
From 5eac17293a9897482e72278e3b0fad23243952b0 Mon Sep 17 00:00:00 2001
From: Lansus
Date: Tue, 8 Apr 2025 20:17:19 +0800
Subject: [PATCH 3/7] style: fix regular expression quote format.

---
 pystdf/IO.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index 62aec38..589ed82 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -28,7 +28,7 @@ from pystdf.Pipeline import DataSource
 
 class Parser(DataSource):
-    _k_field_pattern = re.compile('k(\d+)([A-Z][a-z0-9]+)')
+    _k_field_pattern = re.compile(r'k(\d+)([A-Z][a-z0-9]+)')
 
     def readAndUnpack(self, header, fmt):
         size = struct.calcsize(fmt)

From 369e0a3e005009983a4b753e6fdc544edf438428 Mon Sep 17 00:00:00 2001
From: Lansus
Date: Wed, 9 Apr 2025 10:19:51 +0800
Subject: [PATCH 4/7] feat: add memorize decorator to cache getFieldParser results.

---
 pystdf/IO.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index 589ed82..08c4c58 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -27,6 +27,13 @@
 
 from pystdf.Pipeline import DataSource
 
+def memorize(func):
+    """Cache method results in the instance's _field_parser_cache dict."""
+    def wrapper(self, fieldType):
+        cache = self.__dict__.setdefault('_field_parser_cache', {})
+        return cache.setdefault(fieldType, func(self, fieldType))
+    return wrapper
+
 class Parser(DataSource):
     _k_field_pattern = re.compile(r'k(\d+)([A-Z][a-z0-9]+)')
 
@@ -183,18 +190,19 @@ def parse(self, count=0):
             self.cancel(exception)
             raise
 
+    @memorize
     def getFieldParser(self, fieldType):
-        if fieldType not in self._cached_field_parsers:
-            if (fieldType.startswith("k")):
-                fieldIndex, arrayFmt = self._k_field_pattern.match(fieldType).groups()
-                def parse_field(parser, header, fields):
-                    return parser.readArray(header, fields[int(fieldIndex)], arrayFmt)
-            else:
-                parseFn = self.unpackMap[fieldType]
-                def parse_field(parser, header, fields):
+        if (fieldType.startswith("k")):
+            fieldIndex, arrayFmt = self._k_field_pattern.match(fieldType).groups()
+            def parse_field(parser, header, fields):
+                return parser.readArray(header, fields[int(fieldIndex)], arrayFmt)
+        else:
+            parseFn = self.unpackMap.get(fieldType, None)
+            if parseFn is None:
+                raise ValueError("Unknown field type '%s'" % fieldType)
+            def parse_field(parser, header, fields):
                 return parseFn(header, fieldType)
-            self._cached_field_parsers[fieldType] = parse_field
-        return self._cached_field_parsers[fieldType]
+        return parse_field
 
     def createRecordParser(self, recType):
         field_parsers = tuple(self.getFieldParser(stdfType) for stdfType in recType.fieldStdfTypes)
@@ -213,7 +221,6 @@ def __init__(self, recTypes=V4.records, inp=sys.stdin, reopen_fn=None, endian=No
         self.inp = inp
         self.reopen_fn = reopen_fn
         self.endian = endian
-        self._cached_field_parsers = {}
         self.recordMap = dict(
             [ ( (recType.typ, recType.sub), recType )
               for recType in recTypes ])
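A subtlety of the memorize decorator introduced in PATCH 4: dict.setdefault evaluates its second argument eagerly, so the wrapped function still executes on every call; what the cache guarantees is that the same result object is handed back each time. A self-contained sketch of the idiom (the Builder class is hypothetical):

    def memorize(func):
        # Per-instance cache via __dict__.setdefault, as in PATCH 4.
        def wrapper(self, key):
            cache = self.__dict__.setdefault('_field_parser_cache', {})
            return cache.setdefault(key, func(self, key))
        return wrapper

    class Builder:                             # hypothetical illustration class
        @memorize
        def build(self, key):
            return [key]                       # a fresh list on every raw call

    b = Builder()
    assert b.build("U4") is b.build("U4")      # same cached object both times

For pystdf this is harmless, since getFieldParser is only called while record parsers are being constructed, not on the per-record hot path.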
From 674205398548e09c786241e483a026244b4b71c2 Mon Sep 17 00:00:00 2001
From: Lansus
Date: Wed, 9 Apr 2025 10:28:30 +0800
Subject: [PATCH 5/7] fix(IO): simplify field parsing logic.

---
 pystdf/IO.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index 08c4c58..716150b 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -197,9 +197,7 @@ def getFieldParser(self, fieldType):
             def parse_field(parser, header, fields):
                 return parser.readArray(header, fields[int(fieldIndex)], arrayFmt)
         else:
-            parseFn = self.unpackMap.get(fieldType, None)
-            if parseFn is None:
-                raise ValueError("Unknown field type '%s'" % fieldType)
+            parseFn = self.unpackMap[fieldType]
             def parse_field(parser, header, fields):
                 return parseFn(header, fieldType)
         return parse_field

From b4980578282c5022dab6661476f717ae6388b22b Mon Sep 17 00:00:00 2001
From: Lansus
Date: Wed, 9 Apr 2025 14:27:20 +0800
Subject: [PATCH 6/7] feat: add batch field reading function.

---
 pystdf/IO.py | 101 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 29 deletions(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index 716150b..475f88d 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -21,6 +21,7 @@
 
 import struct
 import re
+import itertools
 
 from pystdf.Types import *
 from pystdf import V4
@@ -29,13 +30,28 @@
 
 def memorize(func):
     """Cache method results in the instance's _field_parser_cache dict."""
-    def wrapper(self, fieldType):
+    def wrapper(self, fieldType, count):
         cache = self.__dict__.setdefault('_field_parser_cache', {})
-        return cache.setdefault(fieldType, func(self, fieldType))
+        return cache.setdefault((fieldType, count), func(self, fieldType, count))
     return wrapper
 
+def groupConsecutiveDuplicates(fieldsList):
+    """Groups consecutive identical field types and returns them with their counts.
+
+    Examples:
+    >>> groupConsecutiveDuplicates(['U4', 'U4', 'U1', 'C1', 'C1', 'C1'])
+    [('U4', 2), ('U1', 1), ('C1', 3)]
+    >>> groupConsecutiveDuplicates([])
+    []
+    """
+    return (
+        [(key, len(list(group))) for key, group in itertools.groupby(fieldsList)]
+        if fieldsList
+        else []
+    )
+
 class Parser(DataSource):
-    _k_field_pattern = re.compile(r'k(\d+)([A-Z][a-z0-9]+)')
+    _kFieldPattern = re.compile(r'k(\d+)([A-Z][a-z0-9]+)')
 
     def readAndUnpack(self, header, fmt):
         size = struct.calcsize(fmt)
@@ -69,6 +85,25 @@ def readField(self, header, stdfFmt):
     def readFieldDirect(self, stdfFmt):
         return self.readAndUnpackDirect(packFormatMap[stdfFmt])
 
+    def batchReadFields(self, header, stdfFmt, count):
+        fmt = packFormatMap[stdfFmt]
+        size = struct.calcsize(fmt)
+        total_size = size * count
+        if (total_size > header.len):
+            self.inp.read(header.len)
+            header.len = 0
+            raise EndOfRecordException()
+        buf = self.inp.read(total_size)
+        if len(buf) == 0:
+            self.eof = 1
+            raise EofException()
+        header.len -= total_size
+        vals = struct.unpack(self.endian + fmt * count, buf)
+        if isinstance(vals[0], bytes):
+            return tuple(map(lambda val: val.decode("ascii"), vals))
+        else:
+            return vals
+
     def readCn(self, header):
         if header.len == 0:
             raise EndOfRecordException()
@@ -191,23 +226,35 @@ def parse(self, count=0):
             raise
 
     @memorize
-    def getFieldParser(self, fieldType):
+    def getFieldParser(self, fieldType, count):
         if (fieldType.startswith("k")):
-            fieldIndex, arrayFmt = self._k_field_pattern.match(fieldType).groups()
-            def parse_field(parser, header, fields):
+            fieldIndex, arrayFmt = self._kFieldPattern.match(fieldType).groups()
+            def parseDynamicArray(parser, header, fields):
                 return parser.readArray(header, fields[int(fieldIndex)], arrayFmt)
-        else:
-            parseFn = self.unpackMap[fieldType]
-            def parse_field(parser, header, fields):
-                return parseFn(header, fieldType)
-        return parse_field
+            return parseDynamicArray
+        if fieldType in self._unpackMap:
+            def parseBatchedFields(parser, header, fields):
+                return parser.batchReadFields(header, fieldType, count)
+            return parseBatchedFields
+        parseFn = self.unpackMap[fieldType]
+        def parseIndividualFields(parser, header, fields):
+            return [parseFn(header, fieldType) for _ in range(count)]
+        return parseIndividualFields
 
     def createRecordParser(self, recType):
-        field_parsers = tuple(self.getFieldParser(stdfType) for stdfType in recType.fieldStdfTypes)
+        grouped_fields = groupConsecutiveDuplicates(recType.fieldStdfTypes)
+        field_parsers = tuple(
+            self.getFieldParser(stdfType, count)
+            for stdfType, count in grouped_fields
+        )
         def fn(parser, header, fields):
             try:
                 for parse_field in field_parsers:
-                    fields.append(parse_field(parser, header, fields))
+                    result = parse_field(parser, header, fields)
+                    if isinstance(result, (list, tuple)):
+                        fields.extend(result)
+                    else:
+                        fields.append(result)
             except EndOfRecordException: pass
             return fields
         return fn
@@ -224,23 +271,19 @@ def __init__(self, recTypes=V4.records, inp=sys.stdin, reopen_fn=None, endian=No
         self.recordMap = dict(
             [ ( (recType.typ, recType.sub), recType )
               for recType in recTypes ])
+        self._unpackMap = {
+            ftype: self.readField
+            for ftype in ("C1", "B1", "U1", "U2", "U4", "U8",
+                          "I1", "I2", "I4", "I8", "R4", "R8")
+        }
         self.unpackMap = {
-            "C1": self.readField,
-            "B1": self.readField,
-            "U1": self.readField,
-            "U2": self.readField,
-            "U4": self.readField,
-            "U8": self.readField,
-            "I1": self.readField,
-            "I2": self.readField,
-            "I4": self.readField,
-            "I8": self.readField,
-            "R4": self.readField,
-            "R8": self.readField,
-            "Cn": lambda header, fmt: self.readCn(header),
-            "Bn": lambda header, fmt: self.readBn(header),
-            "Dn": lambda header, fmt: self.readDn(header),
-            "Vn": lambda header, fmt: self.readVn(header)
+            **self._unpackMap,
+            **{
+                "Cn": lambda header, _: self.readCn(header),
+                "Bn": lambda header, _: self.readBn(header),
+                "Dn": lambda header, _: self.readDn(header),
+                "Vn": lambda header, _: self.readVn(header)
+            }
         }
 
         self.recordParsers = dict(
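PATCH 6 compresses runs of identical fixed-size field types into single struct.unpack calls. Both halves of the idea can be tried in isolation; in this sketch the 'H' format code merely stands in for whatever packFormatMap maps a given STDF type to:

    import itertools
    import struct

    # Group consecutive identical field types, as groupConsecutiveDuplicates does.
    fields = ['U4', 'U4', 'U1', 'C1', 'C1', 'C1']
    groups = [(key, len(list(group))) for key, group in itertools.groupby(fields)]
    assert groups == [('U4', 2), ('U1', 1), ('C1', 3)]

    # One unpack call then covers a whole group of fixed-size fields.
    buf = struct.pack('<HH', 7, 8)
    assert struct.unpack('<' + 'H' * 2, buf) == (7, 8)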
From 534154ae369ec4c8fc8c122e726b272bf91cc98e Mon Sep 17 00:00:00 2001
From: Lansus
Date: Wed, 9 Apr 2025 17:02:45 +0800
Subject: [PATCH 7/7] fix(Parser): fix the logic for reading single and batched fields.

---
 pystdf/IO.py | 50 +++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/pystdf/IO.py b/pystdf/IO.py
index 475f88d..65c8e9d 100644
--- a/pystdf/IO.py
+++ b/pystdf/IO.py
@@ -21,7 +21,6 @@
 
 import struct
 import re
-import itertools
 
 from pystdf.Types import *
 from pystdf import V4
@@ -44,6 +43,7 @@ def groupConsecutiveDuplicates(fieldsList):
     >>> groupConsecutiveDuplicates([])
     []
     """
+    import itertools
     return (
         [(key, len(list(group))) for key, group in itertools.groupby(fieldsList)]
         if fieldsList
@@ -88,19 +88,24 @@ def readFieldDirect(self, stdfFmt):
     def batchReadFields(self, header, stdfFmt, count):
         fmt = packFormatMap[stdfFmt]
         size = struct.calcsize(fmt)
-        total_size = size * count
-        if (total_size > header.len):
-            self.inp.read(header.len)
+        totalSize = size * count
+        if (totalSize > header.len):
+            fullCount = header.len // size
+            if not fullCount:
+                header.len = 0
+                return (None,) * count
+            tmpResult = list(self.batchReadFields(header, stdfFmt, fullCount))
             header.len = 0
-            raise EndOfRecordException()
-        buf = self.inp.read(total_size)
+            tmpResult.extend([None] * (count - fullCount))
+            return tuple(tmpResult)
+        buf = self.inp.read(totalSize)
         if len(buf) == 0:
             self.eof = 1
             raise EofException()
-        header.len -= total_size
+        header.len -= totalSize
         vals = struct.unpack(self.endian + fmt * count, buf)
         if isinstance(vals[0], bytes):
-            return tuple(map(lambda val: val.decode("ascii"), vals))
+            return tuple(val.decode("ascii") for val in vals)
         else:
             return vals
 
@@ -231,27 +236,30 @@ def getFieldParser(self, fieldType, count):
             fieldIndex, arrayFmt = self._kFieldPattern.match(fieldType).groups()
             def parseDynamicArray(parser, header, fields):
                 return parser.readArray(header, fields[int(fieldIndex)], arrayFmt)
-            return parseDynamicArray
+            return parseDynamicArray, count
         if fieldType in self._unpackMap:
             def parseBatchedFields(parser, header, fields):
-                return parser.batchReadFields(header, fieldType, count)
-            return parseBatchedFields
+                result = parser.batchReadFields(header, fieldType, count)
+                return result
+            return parseBatchedFields, 1
         parseFn = self.unpackMap[fieldType]
         def parseIndividualFields(parser, header, fields):
-            return [parseFn(header, fieldType) for _ in range(count)]
-        return parseIndividualFields
+            return parseFn(header, fieldType)
+        return parseIndividualFields, count
 
     def createRecordParser(self, recType):
-        grouped_fields = groupConsecutiveDuplicates(recType.fieldStdfTypes)
-        field_parsers = tuple(
-            self.getFieldParser(stdfType, count)
-            for stdfType, count in grouped_fields
-        )
+        fieldParsers = []
+        groupedFields = groupConsecutiveDuplicates(recType.fieldStdfTypes)
+        for (stdfType, count) in groupedFields:
+            func, times = self.getFieldParser(stdfType, count)
+            for _ in range(times):
+                fieldParsers.append(func)
+
         def fn(parser, header, fields):
             try:
-                for parse_field in field_parsers:
-                    result = parse_field(parser, header, fields)
-                    if isinstance(result, (list, tuple)):
+                for parseField in fieldParsers:
+                    result = parseField(parser, header, fields)
+                    if isinstance(result, tuple):
                         fields.extend(result)
                     else:
                         fields.append(result)
             except EndOfRecordException: pass
             return fields
         return fn
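The truncation handling added in PATCH 7 can also be exercised in isolation: when the bytes remaining in a record cannot satisfy a whole group of fields, only the complete fields are unpacked and the tail is padded with None. A sketch of that logic over a standalone buffer rather than the parser's input stream (batch_read is a hypothetical stand-in for batchReadFields):

    import struct

    def batch_read(buf, fmt, count, endian='<'):
        # Mirrors PATCH 7: unpack as many complete fields as fit, pad with None.
        size = struct.calcsize(fmt)
        total = size * count
        if total > len(buf):
            full_count = len(buf) // size
            vals = struct.unpack(endian + fmt * full_count, buf[:size * full_count])
            return vals + (None,) * (count - full_count)
        return struct.unpack(endian + fmt * count, buf[:total])

    # A 6-byte record cannot satisfy four U2-style fields (8 bytes needed).
    assert batch_read(struct.pack('<HHH', 1, 2, 3), 'H', 4) == (1, 2, 3, None)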