diff --git a/src/haddock/libs/libalign.py b/src/haddock/libs/libalign.py index a42cd5344..c2c1983c4 100644 --- a/src/haddock/libs/libalign.py +++ b/src/haddock/libs/libalign.py @@ -11,6 +11,7 @@ * :py:func:`pdb2fastadic` * :py:func:`get_atoms` * :py:func:`get_align` +* :pt:func:`align_custom` * :py:func:`align_struct` * :py:func:`align_seq` * :py:func:`make_range` @@ -770,7 +771,7 @@ def pdb2fastadic(pdb_f: PDBPath) -> dict[str, dict[int, str]]: def get_align( - method: str, lovoalign_exec: FilePath + method: str, lovoalign_exec: FilePath, align_fname: Optional[FilePath] = None ) -> partial[dict[str, dict[int, int]]]: """ Get the alignment function. @@ -783,15 +784,25 @@ def get_align( lovoalign_exec : str Path to the lovoalign executable. + align_fname : str + Path to the custom alignment file, optional + Returns ------- align_func : functools.partial desired alignment function """ - if method == "structure": + if align_fname: + if not Path(align_fname).exists(): + raise FileNotFoundError(f"Custom alignment file {align_fname} not found") + align_func = partial(align_custom, custom_alig_file = Path(align_fname)) + + elif method == "structure": align_func = partial(align_strct, lovoalign_exec=lovoalign_exec) + elif method == "sequence": align_func = partial(align_seq) + else: available_alns = ("sequence", "structure") raise ValueError( @@ -801,6 +812,73 @@ def get_align( return align_func +def align_custom( + reference: PDBFile, + model: PDBFile, + output_path: FilePath, + custom_alig_file: FilePath, +) -> tuple[dict[str, dict[int, int]], dict[str, str]]: + """ + Parse custom alignment file to extract numbering relationship. + + Parameters + ---------- + reference : :py:class:`haddock.libs.libontology.PDBFile` + + model : :py:class:`haddock.libs.libontology.PDBFile` + + output_path : Path + + custom_alig_file : Path + Path to the custom .izone alignment file + + Returns + ------- + numbering_dic : dict + dictionary of residue mapping (one per chain) + + model2ref_chain_dict : dict + dictionary of chain mapping + + """ + numbering_dic: dict[str, dict[int, int]] = {} + model2ref_chain_dict: dict[str, str] = {} + + # read align file, skip everything that is not 'ZONE' + with open(custom_alig_file, 'r') as f: + zone_lines = (line.strip() for line in f if line.strip().startswith('ZONE')) + + for line in zone_lines: + if len(line.split()) <2: + log.warning(f"Unexpected {line} in {custom_alig_file}, skipping") + continue + + mapping = line.split()[1] + if ':' not in mapping: + log.warning(f"Missing colon in mapping: {line}") + continue + + # constructs dictionaries + try: + model_side, ref_side = mapping.split(':') + model_chain = model_side[0] + model_resid = int(model_side[1:]) + ref_chain = ref_side[0] + ref_resid = int(ref_side[1:]) + + if ref_chain not in numbering_dic: + numbering_dic[ref_chain] = {} + + numbering_dic[ref_chain][model_resid] = ref_resid + model2ref_chain_dict[model_chain] = ref_chain + + except (ValueError, IndexError) as e: + log.warning(f"Could not parse line: {line} ({e})") + continue + + return numbering_dic, model2ref_chain_dict + + def align_strct( reference: PDBFile, model: PDBFile, diff --git a/src/haddock/modules/analysis/caprieval/capri.py b/src/haddock/modules/analysis/caprieval/capri.py index 5d16c6927..b9c9b9619 100644 --- a/src/haddock/modules/analysis/caprieval/capri.py +++ b/src/haddock/modules/analysis/caprieval/capri.py @@ -612,6 +612,7 @@ def run(self) -> Union[None, "CAPRI"]: align_func = get_align( method=self.params["alignment_method"], lovoalign_exec=self.params["lovoalign_exec"], + align_fname=self.params["align_fname"], ) self.model2ref_numbering, self.model2ref_chain_dict = align_func( self.reference, self.model, self.path diff --git a/src/haddock/modules/analysis/caprieval/defaults.yaml b/src/haddock/modules/analysis/caprieval/defaults.yaml index 62821a00e..dc62d2d18 100644 --- a/src/haddock/modules/analysis/caprieval/defaults.yaml +++ b/src/haddock/modules/analysis/caprieval/defaults.yaml @@ -151,6 +151,19 @@ sort_ascending: group: analysis explevel: easy +align_fname: + default: '' + type: file + title: Alignment filename + short: Filename of the file containing alignment information, izone format. + long: Alignment file in izone format containing the mapping between residues. E.g. "ZONE A1:A5" means that + residue 1 of chain A of the model corresponds to residue 5 of chain A or the reference. + Note that if this parameter is provided, the 'alignment_method' parameter is ignored. + Note that all caprieval metrics will be calculated using exclusively the chains and residues f srom izone file. + Note that there is no prior consistency verification between izone and pdb files - incorrect izone may crash caprieval. + group: analysis + explevel: expert + alignment_method: default: sequence type: string diff --git a/tests/golden_data/protdna_complex.izone b/tests/golden_data/protdna_complex.izone new file mode 100644 index 000000000..d5fa76772 --- /dev/null +++ b/tests/golden_data/protdna_complex.izone @@ -0,0 +1,88 @@ +# protein +ZONE A-1:A-1 +ZONE A0:A0 +ZONE A1:A1 +ZONE A2:A2 +ZONE A3:A3 +ZONE A4:A4 +ZONE A5:A5 +ZONE A6:A6 +ZONE A7:A7 +ZONE A8:A8 +ZONE A9:A9 +ZONE A10:A10 +ZONE A11:A11 +ZONE A12:A12 +ZONE A13:A13 +ZONE A14:A14 +ZONE A15:A15 +ZONE A16:A16 +ZONE A17:A17 +ZONE A18:A18 +ZONE A19:A19 +ZONE A20:A20 +ZONE A21:A21 +ZONE A22:A22 +ZONE A23:A23 +ZONE A24:A24 +ZONE A25:A25 +ZONE A26:A26 +ZONE A27:A27 +ZONE A28:A28 +ZONE A29:A29 +ZONE A30:A30 +ZONE A31:A31 +ZONE A32:A32 +ZONE A33:A33 +ZONE A34:A34 +ZONE A35:A35 +ZONE A36:A36 +ZONE A37:A37 +ZONE A38:A38 +ZONE A39:A39 +ZONE A40:A40 +ZONE A41:A41 +ZONE A42:A42 +ZONE A43:A43 +ZONE A44:A44 +ZONE A45:A45 +ZONE A46:A46 +ZONE A47:A47 +ZONE A48:A48 +ZONE A49:A49 +ZONE A50:A50 +ZONE A51:A51 +ZONE A52:A52 +ZONE A53:A53 +ZONE A54:A54 +ZONE A55:A55 +ZONE A56:A56 +ZONE A57:A57 +ZONE A58:A58 +ZONE A59:A59 +ZONE A60:A60 +ZONE A61:A61 +ZONE A62:A62 +# DNA +ZONE B1:B1 +ZONE B2:B2 +ZONE B3:B3 +ZONE B4:B4 +ZONE B5:B5 +ZONE B6:B6 +ZONE B7:B7 +ZONE B8:B8 +ZONE B9:B9 +ZONE B10:B10 +ZONE B11:B11 +ZONE B28:B28 +ZONE B29:B29 +ZONE B30:B30 +ZONE B31:B31 +ZONE B32:B32 +ZONE B33:B33 +ZONE B34:B34 +ZONE B35:B35 +ZONE B36:B36 +ZONE B37:B37 +ZONE B38:B38 diff --git a/tests/test_libalign.py b/tests/test_libalign.py index 343ae66eb..0a480124f 100644 --- a/tests/test_libalign.py +++ b/tests/test_libalign.py @@ -22,6 +22,7 @@ make_range, pdb2fastadic, rearrange_xyz_files, + align_custom, ) from . import golden_data @@ -567,3 +568,47 @@ def test_check_chains(): assert obs_r_chain == exp_r_chain assert obs_l_chain == exp_l_chain +### new +def test_align_custom_basic(): + """Test basic custom alignment file parsing. + Use protdna_complex_1.pdb and protdna_complex_2.pdb and a 1:1 + custom alignment file protdna_complex.izone + """ + ref = Path(golden_data, "protdna_complex_1.pdb") + mod = Path(golden_data, "protdna_complex_2.pdb") + custom_izone = Path(golden_data, "protdna_complex.izone") + + observed_numb_dic, observed_chm_dict = align_custom( + reference=ref, + model=mod, + output_path=golden_data, + custom_alig_file=custom_izone + ) + + expected_chm_dict = {"A": "A", "B": "B"} + + assert isinstance(observed_numb_dic, dict) + assert "A" in observed_numb_dic + assert "B" in observed_numb_dic + assert isinstance(observed_numb_dic["A"], dict) + assert isinstance(observed_numb_dic["B"], dict) + # protein has 64 residues, each should be in mapping + assert len(observed_numb_dic["A"]) == 64 + # same for dna + assert len(observed_numb_dic["B"]) == 22 + # several random residues + assert observed_numb_dic["A"][-1] == -1 + assert observed_numb_dic["B"][38] == 38 + assert isinstance(observed_chm_dict, dict) + assert observed_chm_dict == expected_chm_dict + + # all mapped residues are integers + for chain, mappings in observed_numb_dic.items(): + for model_res, ref_res in mappings.items(): + assert isinstance(model_res, int) + assert isinstance(ref_res, int) + + # all mapped chains are strings + for model_chain, ref_chain in observed_chm_dict.items(): + assert isinstance(model_chain, str) + assert isinstance(ref_chain, str) \ No newline at end of file diff --git a/tests/test_module_caprieval.py b/tests/test_module_caprieval.py index 12da92c79..3f9e240c5 100644 --- a/tests/test_module_caprieval.py +++ b/tests/test_module_caprieval.py @@ -855,6 +855,7 @@ def test_capri_run(mocker, monkeypatch): "fnat_cutoff": 10, "irmsd_cutoff": 10, "keep_hetatm": False, + "align_fname": None, }, ) rand_fnat = random.random()