@@ -216,7 +216,8 @@ def __init__(
216216 if global_info is not None :
217217 self .set_global_info (global_info )
218218 if data_file is not None :
219- self .set_data_file (data_file , skip_checksum = skip_checksum , map_readonly = map_readonly )
219+ offset = self ._get_ncd_offset ()
220+ self .set_data_file (data_file , skip_checksum = skip_checksum , map_readonly = map_readonly , offset = offset )
220221
def __len__(self):
    """
    Return the length of the first axis of the mapped data.

    Reads ``self._memmap.shape[0]``; the memmap must already be set,
    otherwise the attribute access raises.
    """
    # NOTE(review): presumably the first axis indexes samples - confirm against set_data_file
    leading_dim = self._memmap.shape[0]
    return leading_dim
@@ -392,6 +393,30 @@ def _is_conforming_dataset(self):
392393 # if we get here, the file exists and is conforming
393394 return True
394395
def _get_ncd_offset(self):
    """
    Work out how many leading bytes of the dataset file to skip.

    A Non-Conforming Dataset (NCD) points at an external non-SigMF file
    (e.g., WAV) whose sample data is preceded by a header. In that case the
    header size recorded under ``HEADER_BYTES_KEY`` in the first capture
    segment is used as the read offset.

    Returns
    -------
    int
        Value stored under ``HEADER_BYTES_KEY`` in the first capture when the
        dataset is non-conforming, the global ``DATASET_KEY`` field is set and
        at least one capture exists; otherwise 0.
    """
    # conforming datasets start with sample data, nothing to skip
    if self._is_conforming_dataset():
        return 0

    segments = self.get_captures()
    external_dataset = self.get_global_field(self.DATASET_KEY)

    # without a dataset reference or any capture segment there is no header to skip
    if not external_dataset or not segments:
        return 0

    header_key = self.HEADER_BYTES_KEY
    first_segment = segments[0]
    if header_key not in first_segment:
        return 0

    # the stored value is returned as-is, no validation is applied here
    return first_segment[header_key]
395420 def get_schema (self ):
396421 """
397422 Return a schema object valid for the current metadata
0 commit comments