MODELLER A Program for Protein Structure Modeling Release 4 Andrej Sali, Roberto Sánchez and Azat Badretdinov The Rockefeller University 1230 York Avenue New York, NY 10021-6399, USA tel +1-212-327 7550, fax +1-212-327 7540 (or 7974) email sali@rockvax.rockefeller.edu URL http://guitar.rockefeller.edu/ 15 June, 1997 ii Contents Copyright notice xi Acknowledgments xiii 1 Introduction 1 1.1 What is Modeller ? : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 1 1.2 Modeller bibliography : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 2 1.3 Distribution : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 5 1.4 Installation : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 6 1.5 Bug reports : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 11 1.6 Method for comparative protein structure modeling by Modeller : : : : : : : : : : : : : : : : : : : 12 1.7 Comparative protein modeling primer : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 14 1.7.1 Finding structures and sequences related to the target sequence : : : : : : : : : : : : : : : : : 14 1.7.2 Preparing an initial family alignment of all structures and sequences : : : : : : : : : : : : : : 14 1.7.3 Becoming familiar with the family fold and improving the alignment : : : : : : : : : : : : : : 14 1.7.4 Selecting the templates : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 15 1.7.5 Model building : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 15 1.7.6 Evaluating the models : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 15 1.7.7 Repeat the cycle : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 16 1.8 Tutorial on using Modeller for 
comparative modeling : : : : : : : : : : : : : : : : : : : : : : : : : 17 1.8.1 Preparing input files : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 17 1.8.2 Running Modeller : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 18 1.8.3 Fully automated comparative modeling : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 18 1.9 Modeller evaluation : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 20 1.10 Modeller benchmarks : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 20 1.11 Modeller updates : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 22 2 Modeller commands 23 2.1 Miscellaneous rules and features of Modeller : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 23 2.1.1 Running Modeller scripts : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 23 2.1.2 Controlling breakpoints and the amount of output : : : : : : : : : : : : : : : : : : : : : : : : 23 2.1.3 File naming : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 24 2.1.4 File types : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 25 2.1.5 Format of the command description : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 25 2.2 Stereochemical parameters and molecular topology : : : : : : : : : : : : : : : : : : : : : : : : : : : : 27 iii iv CONTENTS 2.2.1 Modeling residues with non-existing or incomplete entries in the topology and parameter libraries : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 27 2.2.2 READ_TOPOLOGY _ read residue topology library : : : : : : : : : : : : : : : : : : : : : : 28 2.2.3 READ_PARAMETERS _ read parameters library : : : : : : : : : : : : : : : : : : : : : : : : 28 2.2.4 READ_ATOM_CLASS _ read 
classification of atom types : : : : : : : : : : : : : : : : : : : 28 2.2.5 GENERATE_TOPOLOGY _ generate MODEL topology : : : : : : : : : : : : : : : : : : : : 29 2.2.6 PATCH _ patch MODEL topology : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 30 2.2.7 PATCH_DISULFIDES _ guess MODEL disulfides from templates : : : : : : : : : : : : : : : 31 2.2.8 MUTATE_MODEL _ mutate selected MODEL residues : : : : : : : : : : : : : : : : : : : : : 31 2.2.9 MAKE_TOPOLOGY_MODEL _ make a subset topology library : : : : : : : : : : : : : : : : 33 2.2.10 WRITE_TOPOLOGY_MODEL _ write residue topology library : : : : : : : : : : : : : : : : 33 2.3 Handling of atomic coordinates : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 34 2.3.1 READ_MODEL _ read coordinates for MODEL : : : : : : : : : : : : : : : : : : : : : : : : : 34 2.3.2 READ_MODEL2 _ read coordinates for MODEL2 : : : : : : : : : : : : : : : : : : : : : : : : 34 2.3.3 WRITE_MODEL _ write MODEL : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 35 2.3.4 WRITE_MODEL2 _ write MODEL2 : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 35 2.3.5 BUILD_MODEL _ build MODEL coordinates from topology : : : : : : : : : : : : : : : : : : 36 2.3.6 TRANSFER_XYZ _ copy templates' coordinates to MODEL : : : : : : : : : : : : : : : : : : 37 2.3.7 TRANSFER_RES_NUMB _ residue numbers from MODEL2 to MODEL : : : : : : : : : : : 38 2.3.8 RENAME_SEGMENTS _ rename MODEL segments : : : : : : : : : : : : : : : : : : : : : : 39 2.3.9 PICK_ATOMS _ select atoms in MODEL : : : : : : : : : : : : : : : : : : : : : : : : : : : : 39 2.3.10 PICK_HOT_ATOMS _ pick atoms violating restraints : : : : : : : : : : : : : : : : : : : : : 42 2.3.11 RANDOMIZE_XYZ _ randomize MODEL coordinates : : : : : : : : : : : : : : : : : : : : : 43 2.3.12 IUPAC_MODEL _ standardize certain dihedral angles : : : : : : : : : : : : : : : : : : : : : 44 2.3.13 REORDER_ATOMS _ standardize order of MODEL atoms : : : : : 
: : : : : : : : : : : : : 44 2.3.14 REORDER2_ATOMS _ order MODEL atoms by MODEL2 : : : : : : : : : : : : : : : : : : 45 2.3.15 ROTATE_DIHEDRALS _ change dihedral angles : : : : : : : : : : : : : : : : : : : : : : : : 45 2.3.16 ORIENT_MODEL _ center and orient MODEL : : : : : : : : : : : : : : : : : : : : : : : : : 46 2.3.17 ROTATE_MODEL _ rotate and translate MODEL : : : : : : : : : : : : : : : : : : : : : : : 46 2.3.18 WRITE_DATA _ write derivative MODEL data : : : : : : : : : : : : : : : : : : : : : : : : : 47 2.4 Comparison and searching of sequences and structures : : : : : : : : : : : : : : : : : : : : : : : : : : 49 2.4.1 Alignment file format : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 49 2.4.2 READ_ALIGNMENT _ read sequences and/or their alignment : : : : : : : : : : : : : : : : 51 2.4.3 READ_ALIGNMENT2 _ read 2nd alignment : : : : : : : : : : : : : : : : : : : : : : : : : : 52 2.4.4 CHECK_ALIGNMENT _ check alignment for modeling : : : : : : : : : : : : : : : : : : : : : 52 2.4.5 COLOR_ALN_MODEL _ color MODEL according to alignment : : : : : : : : : : : : : : : : 53 2.4.6 COMPARE_ALIGNMENTS _ compare two alignments : : : : : : : : : : : : : : : : : : : : : 53 2.4.7 SEQUENCE_TO_ALI _ copy MODEL sequence and coordinates to alignment : : : : : : : : 54 2.4.8 WRITE_ALIGNMENT _ write sequences and/or their alignment : : : : : : : : : : : : : : : 55 2.4.9 DESCRIBE _ describe proteins : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 56 2.4.10 ID_TABLE _ calculate percentage sequence identities : : : : : : : : : : : : : : : : : : : : : : 56 2.4.11 SEQUENCE_COMPARISON _ compare sequences in alignment : : : : : : : : : : : : : : : : 57 2.4.12 DENDROGRAM _ clustering : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 58 2.4.13 PRINCIPAL_COMPONENTS _ clustering : : : : : : : : : : : : : : : : : : : : : : : : : : : : 58 CONTENTS v 2.4.14 ALIGN _ align two (blocks of) sequences : : : : : : : : : : 
: : : : : : : : : : : : : : : : : : : 58 2.4.15 ALIGN2D _ align sequences with structures : : : : : : : : : : : : : : : : : : : : : : : : : : : 60 2.4.16 MALIGN _ align two or more sequences : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 62 2.4.17 ALIGN_CONSENSUS _ consensus sequence alignment : : : : : : : : : : : : : : : : : : : : : 62 2.4.18 SUPERPOSE _ superpose MODEL2 on MODEL given alignment : : : : : : : : : : : : : : : 63 2.4.19 COMPARE _ compare 3D structures given alignment : : : : : : : : : : : : : : : : : : : : : : 64 2.4.20 ALIGN3D _ align two structures : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 66 2.4.21 MALIGN3D _ align two or more structures : : : : : : : : : : : : : : : : : : : : : : : : : : : 67 2.4.22 EXPAND_ALIGNMENT _ put all models into alignment : : : : : : : : : : : : : : : : : : : : 69 2.4.23 SEQUENCE_SEARCH _ search for similar sequences : : : : : : : : : : : : : : : : : : : : : : 69 2.4.24 DELETE_ALIGNMENT _ delete alignment : : : : : : : : : : : : : : : : : : : : : : : : : : : 71 2.5 Calculation of spatial restraints : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 72 2.5.1 Specification of restraints : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 72 2.5.2 MAKE_RESTRAINTS _ make restraints : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 75 2.5.3 DEFINE_SYMMETRY _ define similar segments : : : : : : : : : : : : : : : : : : : : : : : : 78 2.5.4 PICK_RESTRAINTS _ pick restraints for selected atoms : : : : : : : : : : : : : : : : : : : : 80 2.5.5 CONDENSE_RESTRAINTS _ remove unselected restraints : : : : : : : : : : : : : : : : : : 81 2.5.6 ADD_RESTRAINT _ add restraint : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 81 2.5.7 DELETE_RESTRAINT _ unselect restraint : : : : : : : : : : : : : : : : : : : : : : : : : : : 82 2.5.8 REINDEX_RESTRAINTS _ renumber MODEL2 restraints for MODEL : : : : : : : : : : : 83 2.5.9 
SPLINE_RESTRAINTS _ approximate restraints by splines : : : : : : : : : : : : : : : : : : 84 2.5.10 READ_RESTRAINTS _ read spatial restraints : : : : : : : : : : : : : : : : : : : : : : : : : 84 2.5.11 WRITE_RESTRAINTS _ write spatial restraints : : : : : : : : : : : : : : : : : : : : : : : : 84 2.6 Optimization of the model : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 86 2.6.1 MAKE_SCHEDULE _ create optimization schedule : : : : : : : : : : : : : : : : : : : : : : : 86 2.6.2 READ_SCHEDULE _ read optimization schedule : : : : : : : : : : : : : : : : : : : : : : : : 87 2.6.3 WRITE_SCHEDULE _ write optimization schedule : : : : : : : : : : : : : : : : : : : : : : : 87 2.6.4 ENERGY _ evaluate MODEL given restraints : : : : : : : : : : : : : : : : : : : : : : : : : : 88 2.6.5 ENERGY_PROFILE _ calculate energy profile of MODEL : : : : : : : : : : : : : : : : : : : 90 2.6.6 OPTIMIZE _ optimize MODEL given restraints : : : : : : : : : : : : : : : : : : : : : : : : 91 2.6.7 SWITCH_TRACE _ open new optimization trace file : : : : : : : : : : : : : : : : : : : : : : 95 2.6.8 DEBUG_FUNCTION _ test code self-consistency : : : : : : : : : : : : : : : : : : : : : : : : 96 3 Modeller scripts * *97 3.1 Flowchart of comparative modeling by Modeller : : : : : : : : : : : : : : : : : : : : : : : : : : : : 97 3.2 Script for comparative modeling : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 99 4 Frequently asked questions (FAQ) 103 5 Top , Modeller scripting language 113 5.1 The source file : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 113 5.2 Top Commands : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 114 5.2.1 DEFINE_INTEGER _ define integer variables : : : : : : : : : : : : : : : : : : : : : : : : : : 115 5.2.2 DEFINE_LOGICAL _ define logical variables : : : : : : : : : : : : : : : : : : : : : : : : : : 115 5.2.3 
DEFINE_REAL _ define real variables : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 115 vi CONTENTS 5.2.4 DEFINE_STRING _ define string variables : : : : : : : : : : : : : : : : : : : : : : : : : : : : 115 5.2.5 SET _ set variable : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 115 5.2.6 OPERATE _ perform mathematic operation : : : : : : : : : : : : : : : : : : : : : : : : : : : 116 5.2.7 STRING_OPERATE _ perform string operation : : : : : : : : : : : : : : : : : : : : : : : : : 116 5.2.8 RESET _ reset Top : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 116 5.2.9 OPEN _ open input file : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 116 5.2.10 WRITE _ write Top objects : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 117 5.2.11 READ _ read record from input file : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 117 5.2.12 CLOSE _ close an input file : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 117 5.2.13 WRITE_TOP _ write the Top program : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 117 5.2.14 SYSTEM _ execute system command : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 118 5.2.15 INQUIRE _ check if file exists : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 118 5.2.16 GO_TO _ jump to label : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 118 5.2.17 LABEL _ place jump label : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 118 5.2.18 INCLUDE _ include Top file : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 118 5.2.19 CALL _ call Top subroutine : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 118 5.2.20 SUBROUTINE _ define Top subroutine : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 119 5.2.21 RETURN _ return from Top subroutine : : : : : : : 
: : : : : : : : : : : : : : : : : : : : : : 119 5.2.22 END_SUBROUTINE _ end definition of Top subroutine : : : : : : : : : : : : : : : : : : : : 119 5.2.23 DO _ DO loop : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 119 5.2.24 IF _ conditional statement for numbers : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 119 5.2.25 STRING_IF _ conditional statement for strings : : : : : : : : : : : : : : : : : : : : : : : : : 119 5.2.26 STOP _ exit Top : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 120 5.3 Predefined Top variables : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 120 6 Methods 121 6.1 Dynamic programming for sequence and structure comparison and searching : : : : : : : : : : : : : 121 6.1.1 Pairwise comparison : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 121 6.1.2 Variable gap penalty : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 122 6.1.3 Local versus global alignment : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 122 6.1.4 Similarity versus distance scores : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 123 6.1.5 Multiple comparisons : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 123 6.2 Optimization of the objective function by Modeller : : : : : : : : : : : : : : : : : : : : : : : : : : 123 6.2.1 Function : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 123 6.2.2 Optimizers : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 124 6.3 Equations used in the derivation of the molecular pdf : : : : : : : : : : : : : : : : : : : : : : : : : : 125 6.3.1 Features and their derivatives : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 125 6.3.2 Restraints and their derivatives : : : : 
: : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 127 6.4 List of commands, arguments, and default values : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 131 List of Figures 1.1 Comparative protein modeling by satisfaction of spatial restraints. : : : : : : : : : : : : : : : : : : : : 12 1.2 Sample spatial restraint. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 13 1.3 Optimization of the objective function. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 13 1.4 CPU-time for Modeller-4 for different modeling cases. : : : : : : : : : : : : : : : : : : : : : : : : : 20 vii viii LIST OF FIGURES List of Tables 1.1 CPU-time for Modeller-4 on various computers. : : : : : : : : : : : : : : : : : : : : : : : : : : : : 21 2.1 List of file types. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 26 2.2 List of mathematical forms of restraints. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 73 2.3 List of feature types that can be restrained. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 74 2.4 List of `physical' restraint types. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 74 2.5 Columns in an optimization trace file. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 95 3.1 List of Modeller scripts. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 97 5.1 List of variable types in Top. : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 114 5.2 Predefined Top variables : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : : 120 ix x LIST OF TABLES Copyright notice Modeller, a protein structure modeling program. Copyright © 1989-1997 Andrej Sali. 
This program is distributed in the hope that it will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for any purpose. The entire risk as to the quality and performance of the program is with you. Distribution of the program is allowed only with the author's written consent. xi xii COPYRIGHT NOTICE Acknowledgments Andrej Sali is grateful to his PhD supervisor Professor Tom L. Blundell in whose laboratory at Birkbeck College the program was initiated. We would also like to thank Professor Martin Karplus who allowed some of the data in the Charmm topology and library files to be used with Modeller. Dr. John P. Overington contributed by making most of the alignments for the derivation of the current mainchain and sidechain dihedral angle pdf's. We are grateful to all Modeller users who pointed out problems and made many suggestions, especially to John P. Overington, Andras Szilagyi, Stefan Grzybek, and Karl-Heintz Ott. We would also like to thank Eric Feyfant for his help in improving the manual. Modeller was written while the author was at: 1989-1990: Department of Crystallography, Birkbeck College, University of London, Malet St, London WC1E 7HX, UK. 1990-1991: ICRF Unit of Structural Molecular Biology, Birkbeck College, Malet St, London WC1E 7HX, UK. 1991-1994: Department of Chemistry, Harvard University, 12 Oxford St, Cambridge, MA 02138, USA. 1995-1997: The Rockefeller University, 1230 York Ave, New York, NY 10021, USA. xiii xiv ACKNOWLEDGMENTS Chapter 1 Introduction 1.1 What is Modeller? Modeller is a computer program that models protein 3D structure by satisfaction of spatial restraints. Modeller is most frequently used for homology or comparative protein structure modeling: The user provides an alignment of a sequence to be modeled with known related structures and Modeller will automatically calculate an all-atom model. 
More generally, the inputs to the program are restraints on the spatial structure of the amino acid sequence(s) and ligands to be modeled. The output is a 3D structure that satisfies these restraints as well as possible. Restraints can in principle be derived from a number of different sources. These include homologous structures (comparative modeling), NMR experiments (NMR refinement), rules of secondary structure packing (combinatorial modeling), cross-linking experiments, fluorescence spectroscopy, image reconstruction in electron microscopy, site-directed mutagenesis, intuition, residue-residue and atom-atom potentials of mean force, etc. The restraints can operate on distances, angles, dihedral angles, and pairs of dihedral angles defined by atoms or pseudo atoms. Presently, Modeller automatically derives the restraints only from the known homologous structures and their alignment with the target sequence. The model of 3D structure is obtained by optimization of a molecular probability density function (pdf). The molecular pdf is optimized with the variable target function procedure in Cartesian space that employs methods of conjugate gradients and molecular dynamics with simulated annealing. Modeller can also do multiple comparison of protein sequences and/or structures, clustering of proteins, and searching of sequence databases. The program is used with a scripting language and does not include any graphics. It is written in almost standard Fortran 77 and is meant to run on a Unix computer. 1 2 CHAPTER 1. INTRODUCTION 1.2 Modeller bibliography More information about Modeller and its applications can be found in the following sources. 1. A. Sali and T.L. Blundell. Definition of general topological equivalence in protein structures: A procedure involving comparison of properties and relationships through simulated annealing and dynamic programming. J. Mol. Biol. 212, 403-428, 1990. An introduction to the Modeller idea. 2. A. Sali, J.P. Overington, M.S. 
Johnson, and T.L. Blundell. From comparisons of protein sequences and structures to protein modelling and design. TIBS 15, 235-240, 1990. An introduction to the Modeller idea. 3. A. Sali. Modelling three-dimensional structure of proteins from their sequence of amino acid residues. University of London. PhD Thesis, 1991. An early detailed description of Modeller. 4. A. Sali and T.L. Blundell. Comparative protein modelling by satisfaction of spatial restraints. J. Mol. Biol. 234, 779-815, 1993. The main Modeller paper that describes most of the method in detail. 5. A. Sali and T.L. Blundell. Comparative protein modelling by satisfaction of spatial restraints. In Protein Structure by Distance Analysis, H. Bohr, and S. Brunak, eds. IOS Press, Amsterdam, pp. 64-86, 1994. A summary of the main Modeller paper (4). 6. A. Sali and J.P. Overington. Derivation of rules for comparative protein modeling from a database of protein structure alignments. Protein Sci., 3, 1582-1596, 1994. Description of disulfide and cis-proline restraints and of the database of family alignments used to derive some other Modeller restraints. 7. A. Sali, R. Matsumoto, H.P. McNeil, M. Karplus, and R.L. Stevens. Three-dimensional models of four mouse mast cell chymases. Identification of proteoglycan-binding regions and protease-specific antigenic epitopes. J. Biol. Chem. 268, 9023-9034, 1993. The first application of Modeller to an experimental problem. 8. R. Matsumoto, A. Sali, N. Ghildyal, M. Karplus, and R.L. Stevens. The cluster of histidines in mouse mast cell protease-7 regulates its binding to heparin serglycin proteoglycan inside and outside of the secretory granule. J. Biol. Chem. 270, 19524-19531, 1995. A combined experimental/modeling study. The experimental verification of predictions based on a Modeller model. 9. A. Sali, L. Potterton, F. Yuan, H. van Vlijmen, and M. Karplus. Evaluation of comparative protein modeling by Modeller. Proteins 23, 318-326, 1995. 
Evaluation of three bona fide Modeller models submitted to the CASP1 conference in December, 1994. 10. A. Sali. Modelling mutations and homologous proteins, Curr. Opin. Biotech., 5, 437-451, 1995. A longish review of comparative modeling. 11. A. Sali. Protein modeling by satisfaction of spatial restraints. Mol. Med. Today 1, 270-277, 1995. A short review of comparative modeling by satisfaction of spatial restraints. 12. Ghildyal, N., Friend, D. S., Stevens, R. L., Austen, K. F., Huang, C., Penrose, J., Sali, A., and Gurish, M. F. (1996). Fate of two mast cell tryptases following passive systemic anaphylaxis of BALB/c and V3 mastocytosis mice. Prolonged retention of exocytosed mMCP-6 in connective tissues and rapid accumulation of enzymatically 1.2. MODELLER BIBLIOGRAPHY 3 active mMCP-7 in the blood chymases. J. Exp. Med. 184, 1061-1073. A combined experimental/modeling study. Modeller models rationalize several observed differences between the two mast cell tryptases. 13. Sheng, Y., Sali, A., Herzog, H., Lahnstein, J., and Krilis, S. (1996). Modelling, expression and site-directed mutagenesis of human β2-glycoprotein I: Identification of the major phospholipid binding site. J. Immunol. 157, 3744-3751. A combined experimental/modeling study. A Modeller model is used to predict the phospholipid binding site on β2-glycoprotein I, subsequently confirmed by site-directed mutagenesis. 14. Xu, L. Z., Sánchez, R., Sali, A., and Heintz, N. (1996). Ligand specificity of brain lipid binding protein. J. Biol. Chem. 271, 24711-24719. A combined experimental/modeling study. Modeller models are used to rationalize and predict ligand specificities of brain lipid binding protein. 15. Wu, S., de Lencastre, H., Sali, A., and Tomasz, A. (1996). A phosphoglucomutase-like gene essential for the optimal expression of methicillin resistance in Staphylococcus aureus: Molecular cloning and DNA sequencing. Microbial Drug Resistance 2, 277-286. 
An application of Modeller to finding homologs of a given sequence. 16. Sánchez, R. and Sali, A. (1996). Comparative protein modeling as an optimization problem. Journal of Molecular Structure (Theochem), in press. A review summarizing the Modeller approach to comparative modeling, its flexibility, and errors in the resulting models. 17. Sánchez, R. and Sali, A. (1997). Advances in comparative protein structure modeling. Curr. Opin. Str. Biol. 7, 206-214. A review of comparative modeling. 18. Koulich, D., Orlova, M., Malhotra, A., Sali, A., Darst, S.A. and Borukhov, S. (1997). Domain organization of Escherichia coli transcript cleavage factors GreA and GreB. J. Biol. Chem. 272, 7201-7210, 1997. An application of Modeller. 19. J.E. Hunt, D.S. Friend, M.F. Gurish, E. Feyfant, A. Sali, C. Huang, N. Ghildyal, S. Stechshulte, K. Frank Austen and R.L. Stevens (1997). Mouse mast cell protease (mMCP) 9, a novel member of the chromosome 14 family of serine proteases that is selectively expressed in uterine mast cells. J. Biol. Chem., in press. An application of Modeller. 20. C. Huang, G.W. Wong, A. Sali, N. Ghildyal, M.F. Gurish, R. Matsumoto and R.L. Stevens (1997). The tryptase, mouse mast cell protease 7, is a potent anticoagulant due to its ability to degrade fibrinogen. Submitted. An application of Modeller. 21. R. Sánchez and A. Sali (1997). Evaluation of comparative protein structure modeling by MODELLER-3. Proteins, submitted. Evaluation of three bona fide Modeller-3 models submitted to the CASP2 conference in December, 1996. Please quote reference 4 in your publications using Modeller. 4 CHAPTER 1. INTRODUCTION I will greatly appreciate it if you send copies of any publications using Modeller to Andrej Sali The Rockefeller University 1230 York Avenue New York, NY 10021-6399 USA tel +1 212 327 7550 fax +1 212 327 7540 (or 7974) email sali@rockvax.rockefeller.edu URL http://guitar.rockefeller.edu/ 1.3. 
DISTRIBUTION 5 1.3 Distribution Modeller is available free of charge to academic non-profit institutions. First, please use the anonymous ftp account on guitar.rockefeller.edu (IP 129.85.13.198) to copy at least the following files from the pub/modeller directory to your computer: the license form (PostScript file academic-- license.ps), the distribution file that contains the data files necessary to run Modeller (modeller4-data.tar.Z), and an executable for each machine type that you want to use (mod_*). The executables differ in the machine type, the operating system version, and the size of a modeling problem they can address. Please see file INSTALLATION for a description of the executables. Next, please complete and mail or fax the license form to Andrej Sali The Rockefeller University 1230 York Avenue New York, NY 10021-6399 USA tel +1 212 327 7550 fax +1 212 327 7540 (or 7974) email sali@rockvax.rockefeller.edu URL http://guitar.rockefeller.edu/ You will then receive a string that has to be assigned to the environment variable KEY_MODELLER4 in your login script (e.g., '.cshrc') before you can run Modeller . See file INSTALLATION for installation instructions (Section 1.4). There is also a Modeller home page on WorldWideWeb at URL http://guitar.rockefeller.edu/ that can be used to download the program and view the manual. A graphical interface to Modeller is available as part of Quanta , InsightII , and Weblab GeneExplorer , interactive molecular modeling programs from MSI, San Diego, with many tools for protein modeling and structural analysis. Quanta , InsightII and GeneExplorer facilitate preparation of input files for Modeller (e.g., an alignment file) as well as an analysis of results (e.g., an evaluation of the models). If you are interested in these programs, please contact Ms. Brenda Pfeiffer Molecular Simulations Inc. 
9685 Scranton Road San Diego, CA 92121-3752 USA tel +1-619-546-5319 fax +1-619-458-0136 email blp@msi.com URL http://www.msi.com/ 6 CHAPTER 1. INTRODUCTION 1.4 Installation The following installation instructions are from the INSTALLATION file in the root directory of the Modeller distribution. See Section 1.3 for how to obtain Modeller. INSTALLATION MODELLER PROTEIN STRUCTURE MODELING BY SATISFACTION OF SPATIAL RESTRAINTS v Copyright(c) 1989-1997 Andrej Sali All Rights Reserved Platforms: MODELLER compiles on Silicon Graphics R4000, R8000, and R10000 machines, Sun SPARC, IBM RS/6000, DEC Alphastation, Hewlett-Packard HP-9000/700, PCs under Linux, with the public domain f2c Fortran-to-C translator, and GNU g77. The latter two compilers allow MODELLER to run on virtually any UNIX computer. Installation: See file modeller4.README for information about how to get MODELLER. The source code is not yet generally available. Thus, most users will be limited to the executables on the MODELLER ftp sites. MODELLER is distributed in two parts: (1) compressed tar archive modeller4-data.tar.Z (scripts, libraries, examples, and documentation in text, PostScript and HTML forms) and (2) executables for various machine types. Once you have obtained the modeller4-data.tar.Z file and the executable, installation consists of unpacking the archive file, moving the correct executable to directory src/main in the unpacked distribution, and finally running the installation script that copies MODELLER to its final installed location. More precisely: **1) Unpack the distribution files: zcat modeller4-data.tar.Z | tar xvf - uncompress mod*Z The result of unpacking will be directory ./modeller4 and uncompressed executable files. Copy the appropriate executable to directory modeller4/src/main and rename it to mod_HOSTTYPE where HOSTTYPE is your machine type (see Section 7); e.g., for the default SGI executable cp mod_iris4d modeller4/src/main/mod_iris4d 1.4. 
INSTALLATION 7 or for the SMALL SGI executable: cp mod_iris4d-SMALL modeller4/src/main/mod_iris4d See Section 7 for more information about executables. Change to directory ./modeller4, execute script 'Install' and answer the questions it asks: cd modeller4 ./Install If everything is OK, go to step 6. If you have any problems, you can try to install the program by hand using the instructions listed in steps 2) - 6) below. It is assumed that the users' login shell will be csh compatible (e.g., csh, tcsh). If not, you have to edit file scripts/setmodeller and scripts/logscript.template so that they conform to the users' login shell format before you run the Install script. **2) If you use tcsh Ver 6 or later, the HOSTTYPE environment variable is defined automatically and you can skip this step. Make sure the HOSTTYPE environment variable is defined by doing either: a) Edit the script `scripts/hosttype' to produce the correct result on your computer. or b) Define HOSTTYPE in your login script file: for sh : HOSTTYPE=iris4d; export HOSTTYPE for csh: setenv HOSTTYPE iris4d Source your login script: for sh : . script_file for csh: source script_file **3) Set the environment variable MODINSTALL to the directory where you want to have MODELLER installed. For example, for the tcsh or csh shells: setenv MODINSTALL /usr/local/bin/modeller4 Note that MODINSTALL must NOT be the same as the unpacked directory (./modeller4). 8 CHAPTER 1. INTRODUCTION **4) If you have the source, compile and install the program and library files: cd src/main ; make opts install If you do not have the source code, make sure you copied the appropriate executable to src/main, as described above, and install the program and library files: cd src/main ; make install If you are not using one of the machines listed above you will probably have to modify scripts/Makefile.include1. Hopefully, you won't have to modify the Fortran code. 
**5) Change your login script to include the following (for csh or tcsh): # Root directory for installed MODELLER: setenv MODINSTALL /usr/local/bin/modeller4 # Root directory for the Protein DataBank (not essential, can be omitted): setenv PDB /usr/local/pdb # MODELLER key (obtained from Andrej Sali at sali@rockvax.rockefeller.edu): setenv KEY MODELLER_KEY # Set MODELLER environment variables and update the command path: if (-e $MODINSTALL/bin/setmodeller) source $MODINSTALL/bin/setmodeller **6) Source your login script. Execute 'rehash'. You can now start using MODELLER. See examples in the "examples" directory. You may delete the MODELLER distribution directory. **7) Executable types: The executables are named mod_HOSTTYPE-CODE1-CODE2-..., where HOSTTYPE is the machine type, and CODE1, CODE2, etc. are optional keywords describing the type of the executable: a) HOSTTYPE codes: ------------------------------------------------------- CODE MACHINE ------------------------------------------------------- _iris4d Silicon Graphics R3000 and R4000, IRIX 5.3 _r8000 Silicon Graphics R8000, IRIX 6.1 _r10000 Silicon Graphics R10000, IRIX 6.2 _alpha DEC Alpha, OSF1 3.2 _hp9000s700 Hewlett-Packard 9000/700, HP-UX A.09.05 _rs6000 IBM RS6000, AIX 3.2 _sol2 SUN Solaris 2, Sun OS 5.4 1.4.
INSTALLATION 9 _i486-linux PC Linux 2.0.18, i586 ------------------------------------------------------- b) Array size codes: --------------------------------------------------------------------- CODE Max.residues Max.atoms Max.len.alignment Max.numb.sequences --------------------------------------------------------------------- no code 1000 15000 1000 40 TINY 500 7500 500 5 SMALL 1000 15000 1000 5 HUGE 1000 15000 1000 101 ENORMOUS 2000 30000 2000 40 --------------------------------------------------------------------- c) Numerical precision codes: ----------------------------- CODE Numerical_precision ----------------------------- DOUBLE double (real*8) no code single (real*4) ----------------------------- d) Operating system: self-evident. **8) Swap space: If the executable is `killed' or `broken' immediately, the most likely reason is that the maximal size of a process on your machine is not large enough. On an SGI, the default executable requests approximately 90 MB of memory (about 200MB on most other machines), although most of it is never used at the same time: MODELLER usually runs within 20 MB of memory. The exact amount of memory demanded at start up time can be found by the `size exec_filename' command. The large memory request is a consequence of an inefficient use of common blocks by MODELLER. In addition, a feature of the UNIX system that copies the memory space of the parent process for each forked child process, which happens during a call to the system() routine, results in double memory needs if compressed atom files are used. To solve these memory problems until a new f90 version with dynamic memory allocation becomes available, increase the swap space on your machine: On an SGI system, you can create virtual, rather than logical swap space since most of the memory required by MODELLER is never used at the same time.
Since MODELLER runs usually take a small amount of real memory, the virtual swap space, which does not take any disk or RAM space, can be very large. To increase the swap space, you can use the `System/System Manager/Swap Manager' menu sequence or the following procedure to make the swap space permanent irrespective of machine re-boots: 10 CHAPTER 1. INTRODUCTION /usr/sbin/mkfile 256m /swap/swap1 # create 256MB swap file /swap/swap1 chkconfig vswap off # eliminate duplicate swap space Add this entry to /etc/fstab to use the 256MB of real swap and another 256MB of virtual swap: /swap/swap1 swap swap pri=2,vlength=1024000 0 0 # vlength in 512B blocks On a SUN Solaris system, swap space is added by creating a swap file (you must have root privilege): mkfile -v 128m extra_swap /* create 128 MB swap file */ swap -a extra_swap /* Let the system know about it */ In order to make these changes permanent add the following line to the vfstab file extra_swap - - swap - no - 1.5. BUG REPORTS 11 1.5 Bug reports Please report Modeller bugs by e-mail to Andrej Sali at sali@rockvax.rockefeller.edu. It is best if you e-mail a uuencoded compressed tar archive of the directory with all input and output files: tar cvf name.tar ./directory_name compress name.tar uuencode name.tar.Z name.tar.Z > name.uue mail -s 'MODELLER bug' sali@rockvax.rockefeller.edu < name.uue If the problem is not apparent in the log files, please include in the archive a README file with a short description of the problem. 12 CHAPTER 1. INTRODUCTION 1.6 Method for comparative protein structure modeling by Modeller Modeller is an implementation of an automated approach to comparative protein structure modeling by satisfaction of spatial restraints (Figure 1.1) [Sali & Blundell, 1993, Sali & Overington, 1994, Sali et al., 1995, Sali, 1995]. The method and its applications to biological problems are described in detail in references listed in Section 1.2.
Briefly, the modeling procedure begins with an alignment of the sequence to be modeled (target) with related known 3D structures (templates). This alignment is usually the input to the program. The output is a 3D model for the target sequence containing all mainchain and sidechain non-hydrogen atoms. Given an alignment, the model is obtained without any user intervention. First, many distance and dihedral angle restraints on the target sequence are calculated from its alignment with template 3D structures (Figure 1.2). The form of these restraints was obtained from a statistical analysis of the relationships between many pairs of homologous structures. This analysis relied on a database of 105 family alignments that included 416 proteins with known 3D structure [Sali & Overington, 1994]. By scanning the database, tables quantifying various correlations were obtained, such as the correlations between two equivalent Cα–Cα distances, or between equivalent mainchain dihedral angles from two related proteins. These relationships were expressed as conditional probability density functions (pdf's) and can be used directly as spatial restraints. For example, probabilities for different values of the mainchain dihedral angles are calculated from the type of a residue considered, from mainchain conformation of an equivalent residue, and from sequence similarity between the two proteins. Another example is the pdf for a certain Cα–Cα distance given equivalent distances in two related protein structures (Figure 1.2). An important feature of the method is that the spatial restraints are obtained empirically, from a database of protein structure alignments. Next, the spatial restraints and Charmm energy terms enforcing proper stereochemistry [Brooks et al., 1983] are combined into an objective function. Finally, the model is obtained by optimizing the objective function in Cartesian space.
The optimization is carried out by the use of the variable target function method [Braun & Gō, 1985] employing methods of conjugate gradients and molecular dynamics with simulated annealing (Figure 1.3). Several slightly different models can be calculated by varying the initial structure. The variability among these models can be used to estimate the errors in the corresponding regions of the fold. Figure 1.1: Comparative protein modeling by satisfaction of spatial restraints. First, the known, template 3D structures (`3D') are aligned with the target sequence to be modeled (`SEQ'). Second, spatial features, such as Cα–Cα distances, hydrogen bonds, and mainchain and sidechain dihedral angles, are transferred from the templates to the target. Thus, a number of spatial restraints on its structure are obtained. Third, the 3D model is obtained by satisfying all the restraints as well as possible. _______________________________________________________ 1.6. METHOD FOR COMPARATIVE PROTEIN STRUCTURE MODELING BY MODELLER 13 Figure 1.2: Sample spatial restraint. A restraint on a given Cα–Cα distance, $d$, is expressed as a conditional probability density function that depends on two other equivalent distances ($d' = 17.0$ and $d'' = 23.5$): $p(d \mid d', d'')$. The restraint (continuous line) is obtained by least-squares fitting a sum of two Gaussian functions to the histogram, which in turn is derived from the database of alignments of protein structures. In practice, more complicated restraints are used that depend on additional information, such as similarity between the proteins, solvent accessibility, and distance from a gap in the alignment [Sali & Blundell, 1993]. _______________________________________________________ Figure 1.3: Optimization of the objective function. Optimization of the objective function (curve) starts with a distorted average of template structures (not with an extended structure as shown here).
The iteration number is indicated below each sample structure. In this run, the first ~2,000 iterations correspond to the variable target function method relying on the conjugate gradients technique. This approach first satisfies sequentially local restraints and slowly introduces longer range restraints until the complete objective function is optimized. In the last 4,750 iterations for this model, molecular dynamics with simulated annealing is used to refine the model. CPU times needed to generate 3D models are shown in Figure 1.4 and Table 1.1. _______________________________________________________ 14 CHAPTER 1. INTRODUCTION 1.7 Comparative protein modeling primer This section outlines all the stages in a comprehensive comparative modeling session. In contrast, the tutorial section (Section 1.8) describes a hands-on example illustrating a simple modeling case. Many "frequently-asked-questions" (FAQ) are answered in Chapter 4. With a convenient structure display and manipulation program, such as Quanta and InsightII (MSI), San Diego, CA; e-mail blp@msi.com, it should be possible to do even the difficult modeling cases in only a few days. To find sample script files for all the Modeller tasks discussed in this section, read file examples/tutorial-all/README. To obtain additional examples of Top scripts using a certain command, explore the examples directory, especially the examples/commands sub-directory. 1.7.1 Finding structures and sequences related to the target sequence Before any modeling can begin, the sequences and segments with known 3D structures that are related to the sequence being modeled must be found. This can be achieved by the Modeller SEQUENCE_SEARCH command. The search relies on a database of structures that are representative [Sali et al., 1995] of the whole Brookhaven Protein Databank (PDB) [Abola et al., 1987].
The PDB codes of these representative structures (about 1,000 codes) are listed in file modlib/CHAINS_3.0_30_XN.cod and their sequences are stored in file modlib/CHAINS_all.seq, which includes about 6,000 sequences for all the unique non-model chains in PDB longer than 25 amino acid residues. The representative structures are likely to have less than 30% sequence identity to each other and a length difference that is at least 30% of the shorter chain or 30 amino acid residues, whichever is smaller. The codes of other known PDB structures related to the representative structures at >30% sequence identity are listed in file modlib/CHAINS_3.0_30_XN.grp. A sample Top script for searching by SEQUENCE_SEARCH is in examples/tutorial-all/search.top. Sequences related to the target are identified by their Z-scores that are larger than 4 or 5 (log file column SIGNIF). For more difficult modeling problems when SEQUENCE_SEARCH does not find any homologs, template matching or threading methods can be used. Widely used programs for threading include Profit [Flockner et al., 1995], Threader [Jones et al., 1992], MatchMaker [Godzik et al., 1992], and the Web server of the David Eisenberg group at UCLA (http://www.mbi.ucla.edu/people/frsvr/frsvr). Templates for loop regions can be found by scanning the whole PDB for segments that fit on the two core regions spanning the loop. These short segments with known 3D structure can be used as templates in exactly the same way as the whole protein structures. It may be beneficial to identify related sequences without known 3D structures at this stage, possibly by FASTA (http://www.ebi.ac.uk/cgi-bin/rbanner/index.cgi), BLAST (http://www.ncbi.nlm.nih.gov/BLAST/), more sensitive dynamic programming searches (http://sgbcd.weizmann.ac.il/), and many other programs that are available through various Internet servers (http://viol.rockefeller.edu/~roberto/tools/tools.html).
Using as many sequences as possible may improve the quality of the alignment prepared in the next two stages. 1.7.2 Preparing an initial family alignment of all structures and sequences The second stage is to prepare a multiple alignment of all the structures and sequences in the family of interest. An initial structural alignment of the structures can be obtained by the MALIGN3D command. It is usually better to include all the structures related to the representative PDB structures, as listed in the modlib/CHAINS_3.0_30_XN.grp file. Using the ALIGN2D, ALIGN or MALIGN commands, the multiple structural alignment can then be aligned as one block with all the related sequences. Unfortunately, in most cases, automatically derived alignments have to be manually edited to optimize the quality of the model derived from the alignment. This is the task for the next stage. 1.7.3 Becoming familiar with the family fold and improving the alignment This is a very important stage because it may result in a significantly improved model. It mostly involves visual inspection of the superposed structures on a graphics terminal, using a program such as Rasmol (Roger Sayle, Glaxo, http://www.umass.edu/microbio/rasmol/), Quanta or InsightII . However, the CHECK_ALIGNMENT command of Modeller should also be used. You can obtain files with multiply superposed structures by using the MALIGN3D command with WRITE_FIT set to on (you can use any alignment for this purpose). The aim at 1.7. COMPARATIVE PROTEIN MODELING PRIMER 15 this stage is to study the family fold, to establish the relationships between various members of the family, and to determine which regions are more conserved and which regions are more variable. This information is used to improve the initial automatically derived alignment.
For example, if necessary, gaps are removed from helices and strands; they should be moved into those exposed regions that show large variations in the family of known structures and to the tips of loops. The role of disulfides and cis-prolines, if any, is noted. Modeller will try to deal with those two features automatically but it is prudent to be careful. It has to be decided whether or not to build models for multi-subunit assemblies and whether or not to include various ligands, such as water molecules, cofactors, metal ions, inhibitors, or substrates. Quanta and InsightII have a set of tools that facilitate inspection and editing of multiple alignments. In order to obtain the best possible model, it is very important to understand how the alignment is used by Modeller [Sali & Blundell, 1993]. In outline, for the aligned regions, Modeller tries to derive a 3D model for the target sequence that is as close to one or the other of the template structures as possible while also satisfying stereochemical restraints (e.g., bond lengths, angles, non-bonded atom contacts, . . . ); the inserted regions, which do not have any equivalent segments in any of the templates, are modeled in the context of the whole molecule, but using their sequence alone. This way of deriving a model means that whenever a user aligns a target residue with a template residue, he tells Modeller to treat the aligned residues as structurally equivalent. 1.7.4 Selecting the templates The new improved alignment is input to the ID_TABLE or COMPARE_SEQUENCES commands to construct a matrix of pairwise sequence distances. This matrix is then used either to prepare an `evolutionary' tree for the whole family or to cluster the proteins by the principal components technique available through the PRINCIPAL_COMPONENTS command of Modeller .
For evolutionary trees, the DENDROGRAM command of Modeller or the Phylip program written by Joe Felsenstein can be used (you can get Phylip by anonymous FTP from evolution.genetics.washington.edu/pub/phylip) [Felsenstein, 1985]. The clustering is then examined to decide which known structures are suitable templates for model building in the next stage. Usually, all significantly different structures in the cluster that contains the target sequence are used. It is not always best to use all related 3D structures as templates because the objective function may become too rugged, sometimes resulting in sub-optimal solutions (e.g., six templates is a large number of templates). It also does not make sense to include two relatively similar templates solved at a high and low resolution; use only the high resolution template. Depending on the modeling problem at hand, other factors can be considered in the selection of templates, such as ligands bound to the template and/or target, whether the template structure was solved in solution or in a crystal, etc. Moreover, more experienced users can try to use a smaller number of templates for mainchain distance restraints and a larger number of templates for sidechain conformation, but that involves editing the Top scripts for comparative modeling. Also, as mentioned above, templates can be very short, such as loops from unrelated protein structures that fit on the given framework regions; for example, canonical loops [Chothia & Lesk, 1987] could be used as templates in modeling a complementarity determining region of an immunoglobulin. 1.7.5 Model building The alignment and the list of templates are used by Modeller to derive several slightly different models automatically. This stage is straightforward and is described in the tutorial (Section 1.8). However, do check the log file for error messages by searching for the `_E>' string. Usually, the representative model is that which has the lowest value of the molecular pdf.
If the models are constructed by the standard procedure, this value is reported in the log file as well as in the REMARK record of the output PDB files with the model. 1.7.6 Evaluating the models The model is evaluated internally and externally. The internal self-consistency check is that the model has to satisfy most restraints used to calculate it, especially the stereochemical restraints. If some restraints are grossly violated in all the models it is likely that the alignment in the corresponding region is incorrect. The restraint violations are reported by the ENERGY command and can be found at the end of the log file. External tests include programs such as Procheck by Roman Laskowski and Janet Thornton (anonymous FTP at ftp.biochem.ucl.ac.uk) [Laskowski et al., 1993], and various 3D profile tests including the Profile3D program written in the David 16 CHAPTER 1. INTRODUCTION Eisenberg group (e-mail david@uclaue.mbi.ucla.edu) [Lüthy et al., 1992], ProsaII written in the Manfred Sippl group (anonymous FTP at gundi.came.sbg.ac.at) [Sippl, 1993], and MatchMaker written by Adam Godzik in the Jeff Skolnick group (e-mail adam@scripps.edu) [Godzik et al., 1992]. Quanta and InsightII also offer various test options. It is useful to compare the models among themselves because those regions that are most variable are also likely to be most in error. Another useful comparison is between the representative model and the templates. Start by comparing the Cα traces, then continue with the backbone comparison and finally include the sidechains. The aim of evaluations is to determine whether or not the model is acceptable. If it is not acceptable, that is if the current model violates some restraints, fails the profile tests, or simply does not appear satisfactory, these evaluations should help to re-align the target sequence and the templates for the next cycle of modeling.
It may also be that the model violates restraints because the optimizer did not find a good optimum of the objective function. In such a case, you could make the optimization more thorough using options to the `model' routine (Section 3.2). At this stage, it may also be useful to add template regions for loops that do not have any related segments in related structures. Such loop templates may be found by scanning the whole database of structures for those segments that can span the anchor regions in the target model. Quanta and InsightII have such a scanning facility. 1.7.7 Repeat the cycle The cycle of template selection, alignment, modeling, and evaluation should be repeated until the model is good enough or until no further improvement is possible. 1.8. TUTORIAL ON USING MODELLER FOR COMPARATIVE MODELING 17 1.8 Tutorial on using Modeller for comparative modeling This section is a `hands on' description of the most basic use of Modeller in comparative modeling. For an outline of the main stages in comparative modeling, see Section 1.7. For "frequently-asked-questions" (FAQ), see Chapter 4. The input are Brookhaven Protein Databank (PDB) atom files of known protein structures and their alignment with the target sequence to be modeled. The output is a model for the target which includes all non-hydrogen atoms. Although Modeller can calculate sequence and structure alignments, it is better to prepare the alignment carefully by other means. The alignment can also contain very short segments such as loops, secondary structure motifs, etc . This tutorial assumes that Modeller is already installed on your computer and that appropriate changes have been made to your login script to install you as a Modeller user. See Section 1.4 for more details on installation (also in the INSTALLATION file in the Modeller distribution directory). 
1.8.1 Preparing input files The sample input files in this tutorial can be found in the examples/tutorial-model directory of the Modeller distribution. There are three kinds of input files: Brookhaven Protein Databank atom files with coordinates for the template structures, the alignment file with the alignment of the template structures with the target sequence, and the Modeller command or script file that tells Modeller what to do. Atom files Each atom file is named code.atm where code is a short protein code, preferably the PDB code; for example, Peptococcus aerogenes ferredoxin would be in a file 1fdx.atm. The code must be used as that protein's identifier throughout the modeling. The atom sets do not have to be superposed by the user before comparative modeling is done. Alignment file One of the formats for the alignment file is related to the PIR database format; this is the preferred format for comparative modeling: C; A sample alignment in the PIR format; used in tutorial >P1;5fd1 structureX:5fd1: 1 : : 106 : :ferredoxin:Azotobacter vinelandii: 1.90:0.192 AFVVTDNCIKCKYTDCVEVCPVDCFYEGPNFLVIHPDECIDCALCEPECPAQAIFSEDEVPEDMQEFIQLNAELA EVWPNITEKKDPLPDAEDWDGVKGKLQHLER* >P1;1fdx sequence:1fdx: 1 : : 54 : :ferredoxin:Peptococcus aerogenes: 2.00:-1.00 AYVINDSC--IACGACKPECPVNIIQGS--IYAIDADSCIDCGSCASVCPVGAPNPED* See Section 2.4.1 for a detailed description of the alignment file format and Section 1.7.3 for the meaning of the alignment in Modeller . Influence of the alignment on the quality of the model cannot be overemphasized. Command CHECK_ALIGNMENT can be used to find some trivial alignment mistakes. Script file The script file contains commands for Modeller , in the Top language (Chapter 5). A sample script file model-default.top to produce one model of sequence 1fdx from the known structure of 5fd1 and from the alignment between the two sequences is 18 CHAPTER 1. INTRODUCTION # Homology modelling by the MODELLER TOP routine 'model'. 
INCLUDE # Include the predefined TOP routines SET ALNFILE = 'alignment.ali' # alignment filename SET KNOWNS = '5fd1' # codes of the templates SET SEQUENCE = '1fdx' # code of the target SET ATOM_FILES_DIRECTORY = './:../atom_files' # directories for input atom files SET STARTING_MODEL= 1 # index of the first model SET ENDING_MODEL = 1 # index of the last model # (determines how many models to calculate) CALL ROUTINE = 'model' # do homology modelling See Section 3.2 for information on the model script and its arguments. 1.8.2 Running Modeller To run Modeller with the script file model-default.top, execute the following command: mod model-default A number of intermediary files are created as the program proceeds. After about 1 minute on an SGI Indigo Impact workstation, the final 1fdx model is written to file 1fdx.B99990001. Examine the model-default.log file for information about the run. In particular, one should always check the output of the CHECK_ALIGNMENT command, which you can find by searching for `chkaln'. Also, check for warning and error messages by searching for `_W>' and `_E>', respectively. There should be no error messages; most often, there are some warning messages that can usually be ignored. 1.8.3 Fully automated comparative modeling Fully automated comparative modeling requires only the target sequence and the coordinates of templates. The structural alignment of the known 3D structures and their alignment with the target sequence is derived automatically. However, the single most important factor that determines the quality of a model is the quality of the alignment. If the alignment is incorrect, the model will also be incorrect. For this reason, the fully automated option for comparative modeling should not be used unless the sequences are so similar that the calculated alignment is likely to be correct (this usually requires more than 50% sequence identity).
Instead, the alignment should be carefully inspected, optimized by hand, and checked by the CHECK_ALIGNMENT command before being used in modeling (Section 1.7). Moreover, several iterations of alignment and modeling may be necessary in general. See Section 1.7 for an outline of a general comparative modeling case. The sample input files for fully automated comparative modeling are located in directory examples/tutorial-align-model. The sample Top file is # A sample TOP file for fully automated comparative modeling INCLUDE # include MODELLER routines SET ATOM_FILES_DIRECTORY = './:../atom_files' # directory with input atom files SET SEGFILE = 'alignment.seg' # input file w/ templates and target SET KNOWNS = '5fd1' '1fdn' '1fxd' '2fxb' # templates' PDB codes SET SEQUENCE = '1fdx' # target code CALL ROUTINE = 'full_homol' # get alignment and a model The alignment.seg file is 1.8. TUTORIAL ON USING MODELLER FOR COMPARATIVE MODELING 19 >P1;1fdx structureX:1fdx:@:@:54:@:ferredoxin:Peptococcus aerogenes: 2.00:-1.00 AYVINDSCIACGACKPECPVNIIQGSIYAIDADSCIDCGSCASVCPVGAPNPED* >P1;1fdn structureX:1fdn:@:@:55:@:ferredoxin:Clostrodium acidiurici: 1.84:-1.0 * >P1;5fd1 structureX:5fd1:@:@:60:@:ferredoxin:Azotobacter vinelandii: 1.90:0.192 * >P1;1fxd structureX:1fxd:@:@:58:@:ferredoxin:Desolfovibrio gigas: 1.70:-1.0 * >P1;2fxb structureX:2fxb:@:@:60:@:ferredoxin:Bacillus thermoproteolyticus: 2.30:-1.0 * 20 CHAPTER 1. INTRODUCTION 1.9 Modeller evaluation The accuracy of the models produced by Modeller has been evaluated at both meetings on "Critical Assessment of Protein Structure Prediction Methods" (CASP) [Mosimann et al., 1995, Dunbrack Jr. et al., 1997]. Modeller -2 was tested at CASP1 [Sali et al., 1995]; Modeller -3 was tested at CASP2 [Sanchez & Sali, a]. Accuracy of models produced by Modeller -4 is similar to that of Modeller -3. This is a summary: The models have good stereochemistry.
In terms of overall root-mean-square deviation, a model is generally slightly closer to the actual structure than the closest template structure, especially when the correct alignment and multiple templates are used. The largest errors occur in the regions that are not aligned correctly or where the template structures are not similar to the correct structure. These regions correspond predominantly to exposed loops, insertions of any length, and non-conserved sidechains. However, when a template structure with more than 40% sequence identity to the target protein is available, the model is likely to have about 90% of the mainchain atoms modeled with an Rms deviation from the X-ray structure of 1 Å, in large part because the templates are likely to be that similar to the X-ray structure of the target. This Rms deviation is comparable to the overall differences between the refined NMR and X-ray crystallography structures of the same protein. 1.10 Modeller benchmarks The speed of Modeller -4 is benchmarked as a function of the number of templates, target sequence length, and computer type (Figure 1.4, Table 1.1). In all benchmarks, the default degree of optimization, which includes molecular dynamics refinement, is used (LIBRARY_SCHEDULE = 4, FINISH_METHOD = 'refine3'). About 60-80% of CPU time is spent on the molecular dynamics refinement stage. This stage can be omitted if some restraint violations are acceptable and saving CPU-time is more important than completely refining the model. Figure 1.4: CPU-time for Modeller -4 for different modeling cases. (a) CPU-time as a function of the number of templates. Two elastase models (240 residues) are calculated per each run. The CPU time shown is the total user time reported by the Unix /bin/time command divided by two. (b) CPU-time as a function of the number of residues in the target sequence. One template is used in all calculations. One model is calculated per each run. 1.10.
MODELLER BENCHMARKS 21 ________________________________________________* *_________________________@ COMPANY COMPUTER * * OS @ __________________PROCESSOR_SPEED_______________* *_________________________@ SGI O2000-4MB, 195MHz * * IRIX 6.4 @ SGI R10000, 195MHz * * IRIX 6.2 @ SGI R8000, 75MHz * * IRIX 6.1 @ SGI R4400, 200MHz * * IRIX 5.3 @ IBM RS6000/340 * * AIX 3.2 @ Gateway PentiumPro PC, i686-200MHz * * Linux 2.0.18 @ DEC Alpha 3000-600, 375MHz * * OSF/1 V3.2 @ DEC Alpha 3000-600, 275MHz * * OSF/1 V3.2 @ Sun SPARC20/712MP, 75MHz * * SunOS 5.4 @ HP 9000/735, 90MHz * * HP-UX A.09.05 @ NeXT NeXTstation, 25MHz * * NeXTStep 3.3 @ __NEC_____________Versa_6200MX,_i586-166MHz_____* *___________Linux_2.0.27__@ Table 1.1: CPU-time for Modeller -4 on various computers. * *The benchmark job is 'exa@ a single comparative model for a 54-residue ferredoxin from a s* *ingle template structure.@ `user' CPU time reported by the /bin/time command is given. Th* *e SIZE column refers to t@ 1000 residues and 40 proteins; SMALL, 1000 residues and 5 prote* *ins; TINY, 500 residues a@ comparative modeling case. The job 'examples/tutorial-model/mod* *el-fast.top'_is_typically@ 22 CHAPTER 1. INTRODUCTION 1.11 Modeller updates This is the list of the major changes since release of Modeller -4 on 17 June, 1997. For an incomplete list of the changes from 17 March 1994 to the release of Modeller -4, see file 'doc/changes-pre-modeller-5'. Chapter 2 Modeller commands Sections in this Chapter describe technical aspects of Modeller . They include: o miscellaneous rules and features of Modeller (Section 2.1); o dealing with stereochemical parameters and molecular topology (Section 2.2); o handling of atomic coordinates (Section 2.3); o comparing and searching of sequences and structures (Section 2.4); o calculating spatial restraints (Section 2.5), o deriving the model by minimizing the restraints (Section 2.6). 
2.1 Miscellaneous rules and features of Modeller This Section describes several features of the program, including file naming conventions, various file types, and the control of the amount of output. 2.1.1 Running Modeller scripts Modeller is run by mod script_file_name where script_file_name is the name of the script file with instructions for Modeller . This file contains commands in the Top language. Each command line consists of the name of the command and optional variable assignments that control the action of the command. The scope of the variables is global; that is, once a variable is assigned on any command line, the assigned values remain in effect, in the main program and all subroutines, until explicitly changed by another assignment. All the commands and the default values of the variables are listed in Section 6.4. This Chapter describes the Top commands that are used for dealing with proteins; the general Top commands (e.g., assignment, flow control, arithmetic operations) are described in Chapter 5. See directory examples for examples of the Top scripts that use commands described in this Chapter. In particular, sub-directory examples/commands contains the examples used in this Chapter. Another set of Top scripts that you could use as templates can be found in the scripts directory. 2.1.2 Controlling breakpoints and the amount of output Some errors are recoverable. For those errors, Top variable MODELLER_STATUS becomes 1. A test is then performed: If MODELLER_STATUS is equal to or greater than STOP_ON_ERROR, execution stops; otherwise, the 23 24 CHAPTER 2. MODELLER COMMANDS control is passed back to the calling Top routine where execution continues with the next Top command. It is then up to your Top script to deal sensibly with the failure of the preceding command. For example, this flexibility allows derivation of multiple models and searching for many sequences, even if some cases abort due to array size or convergence problems.
There are four kinds of messages that Modeller writes to the log file, indexed 1 to 4: long output from the Modeller commands, short notes to do with the execution of the program (files opened, etc.), warnings identified by `_W>', and errors identified by `_E>'. The four elements in the Top variable OUTPUT_CONTROL[1:4] can assume values of 0 or `not 0'; 0 indicates that the corresponding information is not written out, `not 0' indicates that it is.1 Thus, different amounts of output can be selected. If all is well, OUTPUT_CONTROL = 1 0 0 1 is convenient because no execution notes or warnings are written out; for debugging, use OUTPUT_CONTROL = 1 1 1 1. 2.1.3 File naming There are several filename generating mechanisms that facilitate file handling. Not all of them apply to all filename types. Environment variables There can be Unix shell environment variables in any input or output filename. The environment variables have to be in the format ${VARNAME} or $(VARNAME). Also, four predefined macros are available for string variables: o ${LIB} is expanded into the $LIB_MODELLER4 shell environment variable (equal to $MODINSTALL/modlib); o ${DIR} is expanded into the Top variable DIRECTORY; o ${JOB} is expanded into the root of the Top script filename; o ${DEFAULT} is expanded into (ROOT_NAME)(FILE_ID)(ID1)(ID2)(FILE_EXT), where ROOT_NAME, FILE_ID, ID1, ID2, and FILE_EXT are Top variables. FILE_ID is a string that may be set to 'default'. In that case, a hard-wired short string is used instead of FILE_ID. Otherwise, the explicitly specified FILE_ID is applied instead. In any case, FILE_ID is not modified by the filename generation routine so that it can be used more than once without resetting it to the 'default' value. Four digits are used for both ID1 and ID2. For example, '2ptn.B99990001' results from ROOT_NAME = '2ptn', FILE_ID = '.B', ID1 = 9999, ID2 = 1, and an empty FILE_EXT. 
Automatic filename generation For any filename, input or output, except for MDT_LIB_FILE and BIN_LIB_FILE, if the value of the variable is 'default' (case insensitive), the actual filename is constructed within the routine that will use the filename. The name is constructed by the same rule as the ${DEFAULT} macro (Section 2.1.3). The only difference between the two cases is that SET FILE = 'default' may not work as expected if the Top variables defining the filename change between the SET command and the command that will use the filename, whereas SET FILE = '${DEFAULT}' will work as expected because the filename FILE is actually constructed during the SET command.2 Directory prefixes Input. For many input filenames, the full filename is obtained by looking for the file in the list of directories specified in the Top variable DIRECTORY. The directories in DIRECTORY are separated by colons (':') (e.g., `dir1:dir2:dir3:...'). DIRECTORY can also contain the current directory (` ' or `./'). The directory prefix for the input atom coordinate filenames is obtained in a similar way, except that ATOM_FILES_DIRECTORY is used instead of DIRECTORY. Moreover, there is an additional mechanism for reading an 1___________________________________________________________ 2This has not been implemented for all the output yet. The 'default' substitution will be phased out because it is a subset of the ${DEFAULT} substitution. 2.1. MISCELLANEOUS RULES AND FEATURES OF MODELLER 25 atom coordinate file which requires specifying the protein code only (see below in Section on coordinate files and derivative data). The list of directories is not scanned for MDT_LIB_FILE, BIN_LIB_FILE and input filenames that start with '/'. In contrast, the INCLUDE_FILE file is looked for in the distribution's $BIN_MODELLER4 directory (equal to the $MODINSTALL/bin directory) in addition to the DIRECTORY directories. 
This allows for an easy inclusion of the predefined system '__*.top' files by the INCLUDE command. Output. For all output filenames, except for MDT_LIB_FILE, BIN_LIB_FILE, and those filenames that start with '/', the full output filename is obtained by pre-fixing the filename with OUTPUT_DIRECTORY. Coordinate files and derivative data When accessing an atom file, a specified filename is tried first. If this is unsuccessful, Modeller automatically expands the original filename by adding extension '.Z'. This allows it to detect atom files compressed with the Unix compress command. If the compressed file exists, Modeller automatically uncompresses it and puts it back into the original state after the reading is finished. If the specified file is still not found, the extensions '.atm', '.pdb', '.ent', and '.crd' are tried in this order, without and with extension '.Z', then also with the 'pdb' prefix. This search for the atom file is repeated through all the directories in ATOM_FILES_DIRECTORY (directories are separated by ':'), unless input atom filename starts with '/', in which case ATOM_FILES_DIRECTORY is neglected. Finally, if still unsuccessful and the file specified by the environment variable $PDBENT exists, the coordinate filename (e.g., the 4 character PDB code) is matched to the list of the full PDB filenames in $PDBENT (compressed and uncompressed). For example, $PDBENT file may be: /disk2/pdb/pdb.pdb.bnl.gov/all_entries/uncompressed_files/pdb1ema.ent /disk2/pdb/pdb.pdb.bnl.gov/all_entries/uncompressed_files/pdb1hbp.ent /disk2/pdb/pdb.pdb.bnl.gov/all_entries/uncompressed_files/pdb1gpy.ent /disk2/pdb/pdb.pdb.bnl.gov/all_entries/uncompressed_files/pdb6gpb.ent /disk2/pdb/pdb.pdb.bnl.gov/all_entries/uncompressed_files/pdb1fia.ent etc. Any derivative data that Modeller may need, including residue solvent accessibilities, hydrogen bonding information, dihedral angles, residue neighbors, etc ., are calculated on demand from the atomic coordinates. 
The most time consuming operation is calculating solvent accessibility, but even this takes only about 1 sec for a 200 residue protein on an SGI Impact Indigo workstation. 2.1.4 File types Modeller uses a number of standard filename extensions to indicate the type of data stored in a file (Table 2.1). The extensions are generally not mandatory, only very helpful. 2.1.5 Format of the command description For each command, the list of arguments, brief description, and an example are given. Additional background information may be found in Chapter 6. The variable types are described as follows (see also Table 5.1): an integer variable or constant a real variable or constant a string variable or constant a logical variable or constant a vector of any length with elements a vector of N elements etc. the same for real, string, and logical types 26 CHAPTER 2. MODELLER COMMANDS _____________________________________________________________________________________________________ __Extension_______________Description________________________________________________________________ .top TOP script with instructions for a Modeller job .log log output produced by a Modeller run .ali alignment or sequences in the PIR format .pap alignment or sequences in the PAP format .aln alignment or sequences in the Quanta format .aln alignment or sequences in the InsightII format .seq, .chn sequence(s) in the PIR alignment format .cod list of sequence codes .grp list of families in PDB .atm, .pdb, .ent atom coordinates in the PDB or Grasp format .crd atom coordinates in the Charmm format .fit fitted protein structures in the PDB format .ini initial Modeller model .B* Modeller model in the PDB format .D* the progress of optimization .V* violations profile .E* energy profile .rsr restraints in MODELLER or USER format .sch schedule file for the variable target function optimization .mat matrix of pairwise protein distances from an alignment .mat matrix of pairwise residue type-residue type 
distance scores .sim.mat matrix of pairwise residue type-residue type similarity scores .lib various Modeller libraries .psa residue solvent accessibilities .sol atomic solvent accessibilities .ngh residue neighbors .dih mainchain and sidechain dihedral angles .var sequence variability profile from multiple alignment __.asgl___________________data_for_plotting_by_Asgl__________________________________________________ ___________Table_2.1:_List_of_file_types.______________ 2.2. STEREOCHEMICAL PARAMETERS AND MOLECULAR TOPOLOGY 27 2.2 Stereochemical parameters and molecular topology All molecular modeling programs generally need to know what are the atoms in all residue types, what are the atom pairs that are covalently bonded to each other (i.e., molecular topology), and what are the ideal bond lengths, angles, dihedral angles, and improper dihedral angles (i.e., internal coordinates and stereochemical restraints). For a given MODEL, these data are constructed mostly from information in the residue topology and parameter libraries. This section describes the commands for reading and writing parameter and residue topology libraries, and for generating, patching, and mutating molecular topology. 2.2.1 Modeling residues with non-existing or incomplete entries in the topology and parameter libraries Defining new residue types is generally one of the more painful areas in developing and using a molecular modeling program. Modeller has two quick-and-dirty solutions described in the next two sections that are often sufficient for comparative modeling involving new residue types. On the other hand, if you are willing to spend some time and define a new entry or complete an incomplete entry in the residue topology or parameter libraries, see Chapter 4, Question 18. 
Residues with defined topology, but with missing parameters The parameter library is used by the MAKE_RESTRAINTS command to construct bond, angle, dihedral angle, improper dihedral angle, and non-bonded Lennard-Jones restraints. If some parameters for these restraints are missing, they are guessed on the fly from the current Cartesian coordinates of the MODEL. Thus, when there are missing parameters, the MODEL coordinates must be defined before calling MAKE_RESTRAINTS. The coordinates can be defined by the BUILD_MODEL command (from the IC entries in the residue topology library), by the READ_MODEL command (from an existing coordinate file for MODEL), or by the TRANSFER_XYZ command (from template coordinate files aligned with MODEL). The bonds, angles, and improper dihedral angles are restrained by a harmonic potential with the mean equal to the value in the current structure and a force constant typical for chemical bonds, angles, and improper dihedral angles, respectively. The dihedral angles are restrained by a tri-modal cosine term with the mean equal to the angle in the current structure. A message detailing Modeller 's improvisation is written to the log file. Block (BLK) residues with undefined topology and parameters The second relatively easy way of dealing with missing entries in the residue topology and/or parameters libraries is to use a "block" residue. These residues are restrained more or less as rigid bodies to the conformation of the equivalent residue(s) in the template(s). No chemical information is used. The template residues can themselves be defined as block residues. The symbol for the block residues is `BLK' in the four- and three-letter codes and `.' in the single-letter code. The atoms in a BLK residue include all uniquely named atoms from the equivalent residues in all the templates. The atom type of all BLK atoms is the Charmm type `undf'. The IUPAC atom names (as opposed to the atom types) are the same as in the templates. 
The `undf' atom type for all BLK atoms facilitates using the PICK_ATOMS command for generating restraints restraining `BLK' residues. The `undf' atoms are treated differently from the other atoms during preparation of dynamic restraints: No pairs of intra-BLK atoms are put on the dynamic non-bonded list. Only the "inter-BLK" atom pairs and "BLK-other" atom pairs are considered for the dynamic non-bonded restraints. The radius of all block atoms is obtained from the $RADII_LIB library using the block atom names (as written out to a PDB file), not the `undf' atom type. All intra-BLK and inter-residue BLK restraints other than the non-bonded restraints have to be derived separately and explicitly by the MAKE_RESTRAINTS command using RESTRAINT_TYPE = 'distance'. See script scripts/__homcsr.top for the routine that makes block restraints for comparative modeling by the `model' script. Lennard-Jones, electrostatic, and general non-bonded spline terms involving `undf' atoms are ignored by Modeller . For an example of how to use block residues, see Chapter 4, Question ?? . 28 CHAPTER 2. MODELLER COMMANDS 2.2.2 READ_TOPOLOGY — read residue topology library Options: FILE = 'default' partial or complete filename DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') ADD_TOPOLOGY = off whether to add new residue topologies to existing ones Description: This command reads residue topologies from the topology library, such as the Charmm 22 topology file [Brooks et al., 1983]. This file must include atomic connectivities of residues and patching residues, and the internal coordinates for minimum energy residue conformations. Patching residues modify residues; for example, N-terminus, C-terminus and disulfide bonds are defined by patching the original topology. This information is used for generating the molecular topology and possibly for calculating an initial conformation. 
The default topology for comparative modeling by Modeller includes only non-hydrogen atoms (TOPOLOGY_MODEL = 3). To define your entries in the topology library, see Chapter 4, Questions 18 and 19. If ADD_TOPOLOGY is on, the new residue topologies are added to the existing residue topologies; otherwise the new topology file replaces the old one. If the topology for a residue is duplicated, only the last definition is kept. Not all the features of the Charmm 22 topology library are implemented in Modeller , although a Charmm file should be read in successfully. A variety of topology files for different kinds of models can be prepared by the MAKE_TOPOLOGY_MODEL command. The filename for the library is DIRECTORY/FILE. Example: See PATCH command. 2.2.3 READ_PARAMETERS — read parameters library Options: FILE = 'default' partial or complete filename DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') ADD_PARAMETERS = off whether to add new parameters to existing ones Description: This command reads the parameters from the parameter library, such as the Charmm 22 parameter file for proteins with all atoms [Brooks et al., 1983]. This file contains the values for bond lengths, angles, dihedral angles, improper dihedral angles, and non-bonded interactions. Modeller relies on slightly modified Charmm 22 parameters to reproduce the protein geometry in the Modeller environment. For example, for the default non-hydrogen atoms model, the $\omega$ dihedral angle restraints are stronger than the original Charmm 22 values which apply to the all-hydrogen model. For a sparse discussion of the parameter library, see Chapter 4, Question 18. If ADD_PARAMETERS is on, the new parameters are added to the existing parameter list; otherwise the contents of the new parameter file replace the old ones. The filename for the library is DIRECTORY/FILE. Example: See PATCH command. 
2.2.4 READ_ATOM_CLASS — read classification of atom types Options: ATOM_CLASSES_FILE = '$(LIB)/atmcls-168.lib' library with atom class definitions for MODELLER non-bonded restraints 2.2. STEREOCHEMICAL PARAMETERS AND MOLECULAR TOPOLOGY 29 Description: This command reads a classification of atom types from file ATOM_CLASSES_FILE. This classification is used for calculation of non-bonded spline restraints, an experimental feature of Modeller so far. The command is used to read the matching atom classification for a given non-bonded parameters library, if the default library, which is read as part of initialization, is not appropriate. Example: See ENERGY_PROFILE command. 2.2.5 GENERATE_TOPOLOGY — generate MODEL topology Options: ADD_SEGMENT = off whether to add the new segments to the list of segments PATCH_DEFAULT = on whether to do default NTER and CTER patching ALIGN_CODES = 'all' codes of proteins in the alignment SEQUENCE = 'undefined' protein code in the alignment whose topology is constructed Requirements: topology and parameter libraries Description: This command calculates MODEL's covalent topology (i.e., atomic connectivity) and internal coordinates, and assigns Charmm atom types, Modeller atom types for non-bonded spline restraints, atomic charges, and atomic radii. If a protein with code SEQUENCE is found in the current alignment (codes of proteins in the current alignment are stored in ALIGN_CODES), this protein's topology is calculated. If no SEQUENCE entry exists or if the alignment does not exist, the sequence of the MODEL is used. If the MODEL does not exist, an error is reported. The MODEL can be read in from an atomic coordinates file with the READ_MODEL command. The new sequence is added to the list of segments of the MODEL if ADD_SEGMENT is on, otherwise this list is initiated. A sequence in the alignment can use any non-patching residue listed in the single-character code column of the $RESTYP_LIB library ('modlib/restyp.lib'). 
Examples of non-standard residue types include water ('w'), zinc ('z'), calcium ('3'), heme ('h'), and many others. Patching residues must not be used here, but with the subsequent PATCH commands. Unrecognized residues are ignored. A special allowed residue type is the chain break `/'. This can be used to construct a protein that consists of several chains separated by chain breaks. Chain breaks before a non-standard residue type (there are 23 standard residue types, including '-', 'Asx' and 'Glx') are inserted automatically and do not have to be specified explicitly in the sequence. The GENERATE_TOPOLOGY command generates only the topology of the MODEL, not its Cartesian coordinates; the Cartesian coordinates are assigned by the BUILD_MODEL, TRANSFER_XYZ, or READ_MODEL commands. In general, the GENERATE_TOPOLOGY command has to be executed before any energy commands (ENERGY, OPTIMIZE, ENERGY_PROFILE, PICK_HOT_ATOMS). The reason is that reading the Cartesian coordinates by the READ_MODEL command does not generate all the data usually needed for energy evaluation. However, if the order and number of atoms in the input file corresponds exactly to the order and number of atoms implied by the restraint atom indices and if you are not using dynamic restraints that rely on non-existing data, such as bond, angle, and dihedral angle lists, atomic charges, radii, Lennard- Jones parameters, Modeller atom types, or Charmm atom types (which are used to determine the atomic radii), it is sufficient to do only READ_MODEL and omit GENERATE_TOPOLOGY before the energy commands. In short, if you use static restraints alone and if the atom file has the atoms in the correct order, you do not have to call GENERATE_TOPOLOGY before calculating energy. Example: See PATCH command. 30 CHAPTER 2. 
MODELLER COMMANDS 2.2.6 PATCH _ patch MODEL topology Options: RESIDUE_IDS = '' identifiers of the patched residues RESIDUE_TYPE = 'undefined' patching residue type Description: This command uses a Charmm patching residue to patch the topology of the MODEL. Charmm patch rules are closely observed. RESIDUE_TYPE is the type of the patching residue (PRES entry in the topology library), such as 'DISU', 'NTER', 'CTER', etc . You do not have to apply explicitly the N- and C-terminal patches to protein chains because the 'NTER' and 'CTER' patches are applied automatically to the appropriate residue types at the termini of each chain at the end of each GENERATE_TOPOLOGY command. RESIDUE_IDS are residue identifiers of the patched residues (Section 2.4.1). The first residue is the patched residue 1, the second residue is the patched residue 2, etc ; for example, the 'DISU' patching residue has two patched Cys residues while the 'ACE' patching residue has only one patched residue. The order of the residue identifiers here has to match the definition of the patching residue in the topology library. It is not allowed to patch an already patched residue. Since the N- and C-terminal residues of each chain are automatically patched with the 'NTER' and 'CTER' patching residues, respectively, a user who wants to patch the N- or C-terminal residues with other patches, should turn the default patching off before executing GENERATE_TOPOLOGY. This is achieved by SET PATCH_DEFAULT = off. Example: # Example for: PATCH, READ_TOPOLOGY, READ_PARAMETERS # This will define a CYS-CYS disulfide bond between residues 3 and 22. 
READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Read the sequence: READ_MODEL FILE = '1fas' # have two copies of the sequence in the alignment, for TRANSFER_XYZ later: SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' # Create the disulfide bond: PATCH RESIDUE_TYPE = 'DISU', RESIDUE_IDS = '3' '22' # Get MODEL's coordinates from the template, using the alignment (1:1 here): TRANSFER_XYZ # Calculate missing coordinates using internal coordinates: BUILD_MODEL INITIALIZE_XYZ = off # Create the stereochemical restraints MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' # Calculate the energy to test the disulfide: ENERGY 2.2. STEREOCHEMICAL PARAMETERS AND MOLECULAR TOPOLOGY 31 2.2.7 PATCH__DISULFIDES _ guess MODEL disulfides from templates Options: ATOM_FILES = '' complete or partial atom filenames Requirements: alignment Description: This command defines and patches disulfide bonds in the MODEL using an alignment of the MODEL sequence with one or more template structures. The MODEL sequence has to be the last sequence in the alignment. The template structures are all the other proteins in the alignment. All Cys-Cys pairs in the target sequence that are aligned with at least one template disulfide are defined as disulfide bonds themselves. The covalent connectivity is patched accordingly. If no alignment exists, a default 1:1 alignment is constructed. Variable ATOM_FILES can be used to specify template structures. This command should be run after GENERATE_TOPOLOGY and before MAKE_RESTRAINTS to ensure that the disulfides are restrained properly by the bond length, angle, and dihedral angle restraints and that no SG-SG non-bonded interactions are applied. 
The disulfide bond, angle and dihedral angle restraints have their own physical restraint type separate from the other bond, angle and dihedral angle restraints (Table 2.4). Example: # Example for: PATCH_DISULFIDES # This will patch CYS-CYS disulfide bonds using disulfides in aligned templates: READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Read the sequence, calculate its topology, and coordinates: READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '2ctx' '2abx' # Superpose the two template structures without changing the alignment. # This is for TRANSFER_XYZ to work properly. It relies on not reading # the atom files again before TRANSFER_XYZ. MALIGN3D FIT = off # This is for TRANSFER_XYZ to work properly. READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = ALIGN_CODES '1fas' GENERATE_TOPOLOGY SEQUENCE = '1fas' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = on # Create the disulfide bonds using equivalent disulfide bonds in templates: PATCH_DISULFIDES # Create the stereochemical restraints MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' # Calculate energy to test the disulfide restraints (bonds, angles, dihedrals): ENERGY 2.2.8 MUTATE__MODEL _ mutate selected MODEL residues Options: RESIDUE_TYPE = 'undefined' new residue type 32 CHAPTER 2. MODELLER COMMANDS Description: This command mutates the selected residues of the MODEL to the type specified by RESIDUE_- TYPE. Charmm 4-character residue type names are used (see library file $RESTYP_LIB). To select the residues for mutation, use PICK_ATOMS command. All the residues with at least one atom in the selected set 1 of atoms are selected. To produce mutants, employ this command with SEQUENCE_TO_ALI and WRITE_ALIGNMENT. It is usually necessary to write the mutated sequence out and read it in before proceeding, because not all sequence related information about MODEL is changed by this command (e.g., internal coordinates, charges, and atom types and radii are not updated). 
Example: # Example for: MUTATE_MODEL # This will read a PDB file, change its sequence a little, build new # coordinates for any of the additional atoms using only the internal # geometry, and write the mutant PDB file. It can be seen as primitive, # but rapid comparative modeling for substitution mutants. For insertion # and deletion mutants, follow the standard comparative modeling procedure. # Read the topology library with non-hydrogen atoms only: READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib', TOPOLOGY_MODEL = 3 # To produce a mutant with all hydrogens, uncomment this line: # READ_TOPOLOGY FILE = '$(LIB)/top.lib', TOPOLOGY_MODEL = 1 # Read the CHARMM parameter library: READ_PARAMETERS FILE = '$(LIB)/par.lib' # Read the original PDB file and copy its sequence to the alignment array: READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = '1fas', ALIGN_CODES = '1fas' # Select the residues to be mutated: in this case all ASP residues: PICK_ATOMS RES_TYPES = 'ASP' # The second example is commented out; it selects residues '1' and '10'. # SET SELECTION_SEARCH = 'SEGMENT', SELECTION_FROM = 'ALL' # PICK_ATOMS SELECTION_SEGMENT = '1' '1', SELECTION_STATUS = 'INITIALIZE' # PICK_ATOMS SELECTION_SEGMENT = '10' '10', SELECTION_STATUS = 'ADD' # Mutate the selected residues into HSD residues (neutral HIS): MUTATE_MODEL RESIDUE_TYPE = 'HSD' # Add the mutated sequence to the alignment arrays (it is now the second # sequence in the alignment): SEQUENCE_TO_ALI ADD_SEQUENCE = on, ALIGN_CODES = ALIGN_CODES '1fas-1' # Generate molecular topology for the mutant: GENERATE_TOPOLOGY SEQUENCE = '1fas-1' # Transfer all the coordinates you can from the template native structure # to the mutant (this works even if the order of atoms in the native PDB # file is not standard): TRANSFER_XYZ # Build the remaining unknown coordinates for the mutant: 2.2. 
STEREOCHEMICAL PARAMETERS AND MOLECULAR TOPOLOGY 33 BUILD_MODEL INITIALIZE_XYZ = off # Write the mutant to a file: WRITE_MODEL FILE = '1fas-1.atm' 2.2.9 MAKE_TOPOLOGY_MODEL — make a subset topology library Options: TOPOLOGY_MODEL = 3 selects topology library: 1-9 Description: This command makes a residue topology library from the most detailed Charmm topology library, which contains all atoms, including all hydrogens (corresponding to TOPOLOGY_MODEL = 1). There are currently nine residue topologies, all of which are defined in library $MODELS_LIB. For example, the default non-hydrogen atom topology is selected by TOPOLOGY_MODEL = 3. For each TOPOLOGY_MODEL and residue type, the $MODELS_LIB library lists those atoms in the full atom set that are part of the specified topology. This command works by deleting all the entries that contain non-existing atoms from the original topology file. One must carefully test topology files produced in this way. Library $RADII_LIB must specify atomic radii for each atom in each residue type for each topology model. TOPOLOGY_MODEL must be an integer from 1 to 9. For more information about the topology library, see Chapter 4, Questions 18 and 19. Example: # Example for: MAKE_TOPOLOGY_MODEL, WRITE_TOPOLOGY_MODEL # This creates a topology library for heavy atoms from the # CHARMM all-atom topology library: # Read CHARMM all-atom topology library: READ_TOPOLOGY FILE = '${LIB}/top.lib' # Keep only heavy atoms (TOPOLOGY_MODEL = 3) MAKE_TOPOLOGY_MODEL TOPOLOGY_MODEL = 3 # Write the resulting topology library to a new file: WRITE_TOPOLOGY_MODEL FILE = 'top_heav.lib' 2.2.10 WRITE_TOPOLOGY_MODEL — write residue topology library Options: FILE = 'default' partial or complete filename OUTPUT_DIRECTORY = '' output directory Description: This command writes a residue topology library to the specified file. It is usually used after MAKE_TOPOLOGY_MODEL. Example: See MAKE_TOPOLOGY_MODEL command. 34 CHAPTER 2. 
MODELLER COMMANDS 2.3 Handling of atomic coordinates This section describes commands for dealing with Cartesian coordinates of a 3D model: for reading, writing, creating and transforming them. 2.3.1 READ_MODEL — read coordinates for MODEL Options: FILE = 'default' name of the coordinates' file ATOM_FILES_DIRECTORY = './' input atom files directory list (e.g., 'dir1:dir2:dir3:./:/') MODEL_SEGMENT = '@:@' 'X:X' segment to be read in MODEL_FORMAT = 'PDB' selects input atom file format: 'PDB' | 'CHARMM' | 'UHBD' WATER_IO = off whether to read water coordinates HETATM_IO = off whether to read HETATM coordinates HYDROGEN_IO = off whether to read hydrogen coordinates Description: This command reads the atomic coordinates, atom names, residue names, residue numbers, isotropic temperature factors and segment specifications for MODEL, assigns residue types, and defines the dihedral angles listed in the $RESDIH_LIB library. For CHARMM and UHBD file formats, it also reads the atomic charges. However, it does not calculate Charmm and Modeller atom types, internal coordinates, charges (in the case of the 'PDB' format), or patches (such as disulfides); to do this, which is necessary for almost all energy commands, use GENERATE_TOPOLOGY. All real and pseudo atoms are selected. The PDB residue type 'HIS' is assigned the Charmm residue type 'HSD', which is the neutral His with H on ND1. The PDB types 'ASP' and 'GLU' are assigned the corresponding charged Charmm residue types, as are 'LYS' and 'ARG'. These conventions are relevant only if electrostatic terms and/or hydrogens are used. MODEL_SEGMENT sets the beginning and ending residue identifiers for the contiguous sequence of residues to be read from the PDB file (this option does not work yet for the other file formats). The format of residue identifiers is described in Section 2.4.1. 
In addition, the following rule applies: If there is no `:' in the first residue identifier, it is assumed that you specified a protein code in the current alignment from which the segment specification is then taken. Similarly, if there is no `:' in the second residue identifier, it is assumed that you specified a protein code in the current alignment from which the PDB filename is then taken. The two codes do not have to be the same. Example: # Example for: READ_MODEL, WRITE_MODEL # This will read a PDB file and write a CHARMM atom file without atomic charges # or radii. For assigning charges and radii, see the all_hydrogen.top script. READ_MODEL FILE = '1fas' WRITE_MODEL FILE = '1fas.crd', MODEL_FORMAT = 'CHARMM' 2.3.2 READ_MODEL2 — read coordinates for MODEL2 Options: FILE = 'default' name of the coordinates' file ATOM_FILES_DIRECTORY = './' input atom files directory list (e.g., 'dir1:dir2:dir3:./:/') 2.3. HANDLING OF ATOMIC COORDINATES 35 MODEL2_SEGMENT = '@:@' 'X:X' segment to be read in MODEL_FORMAT = 'PDB' selects input atom file format: 'PDB' | 'CHARMM' | 'UHBD' WATER_IO = off whether to read water coordinates HETATM_IO = off whether to read HETATM coordinates HYDROGEN_IO = off whether to read hydrogen coordinates Description: This command reads a coordinate file for MODEL2. See the description of the READ_MODEL command for more information. It is used in conjunction with the SUPERPOSE, TRANSFER_RES_NUMB, REORDER_ATOMS and some other commands, as well as for changing the format of the atom file. Example: See READ_MODEL command. 2.3.3 WRITE_MODEL — write MODEL Options: FILE = 'default' name of the coordinates' file OUTPUT_DIRECTORY = '' output directory MODEL_FORMAT = 'PDB' selects output atom file type: 'PDB' | 'CHARMM' | 'UHBD' | 'GRASP' WRITE_ALL_ATOMS = on whether to write all atoms, even if unselected Requirements: MODEL Description: This command writes the current MODEL to a file in the selected format. 
If the file format is 'PDB', only the selected atoms are written out when WRITE_ALL_ATOMS = off; otherwise all atoms are written out. The 'GRASP' format is the same as the 'PDB' format, except that it includes two special lines at the top of the file and the atomic radii and charges in the columns following the Cartesian coordinates of atoms. This format is useful for input to program Grasp, written by Anthony Nicholls in the group of Barry Honig at Columbia University [Nicholls et al., 1991]. Example: See READ_MODEL command. 2.3.4 WRITE__MODEL2 _ write MODEL2 Options: FILE = 'default' name of the coordinates' file OUTPUT_DIRECTORY = '' output directory MODEL_FORMAT = 'PDB' selects output atom file type: 'PDB' _ 'CHARMM' _ 'UHBD' _ 'GRASP' Requirements: MODEL2 Description: This command writes MODEL2 to a file in the selected format. It is used in conjunction with the SUPERPOSE, TRANSFER_RES_NUMB, REORDER_ATOMS and some other commands, as well as for changing the format of the atom file. Example: See READ_MODEL command. 36 CHAPTER 2. MODELLER COMMANDS 2.3.5 BUILD__MODEL _ build MODEL coordinates from topology Options: INITIALIZE_XYZ = on whether to use IC entries to calculate all coordinates RAND_SEED = -8123 random seed from -50000 to -2 BUILD_METHOD = '3D_INTERPOLATION' method for building coordinates: 'INTERNAL_COORDINATES' _ '3D_INTERPOLATION' Requirements: topology file & parameters file & MODEL topology Description: This command builds Cartesian coordinates of the MODEL. If INITIALIZE_XYZ is on, all coordinates are built. Otherwise only the undefined coordinates are built. The latter is used because some coordinates may be undefined after the TRANSFER_XYZ command. The undefined coordinates have a value of -999 when written to a PDB file. If BUILD_METHOD is 'INTERNAL_COORDINATES', the Cartesian coordinates will be built from the ideal values of the internal coordinates as obtained from the IC entries in the residue topology library.
If an appropriate IC entry does not exist, the ideal value of the internal coordinate is calculated from the corresponding energy term in the parameter library. If some coordinates still cannot be built, they will be set to values close to those of the neighboring atoms. If even this fails, they will be set randomly. If BUILD_METHOD is '3D_INTERPOLATION', the Cartesian coordinates will be built by linearly interpolating between two defined spanning atoms. If any of the spanning atoms is not defined, a vector from the gravity center to the opposite spanning atom is defined. The spanning atom position is then defined by extending this vector from the opposite spanning atom for 0.4 times the number of inserted atoms, in angstroms. If there is no opposite spanning atom, the spanning atom is defined randomly. Once the two spanning positions are obtained, the coordinates for the intervening atoms, all of which are undefined, are defined by equidistant linear interpolation within the interval spanned by the two spanning positions. Note that in this mode, both the mainchain and sidechain conformations of all inserted residues are random and distorted. However, this particular build-up mode may sometimes eliminate a knot and minimize the extended nature of the insertion obtained by BUILD_METHOD = 'INTERNAL_COORDINATES'. Example: # Example for: BUILD_MODEL # This will build a model for a given sequence in an extended conformation. READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Read the sequence from a file (does not have to be part of an alignment): READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1fas' # Calculate its molecular topology: GENERATE_TOPOLOGY SEQUENCE = '1fas' # Calculate its Cartesian coordinates using internal coordinates and # parameters if necessary: BUILD_MODEL INITIALIZE_XYZ = on # Write the coordinates to a PDB file: WRITE_MODEL FILE = '1fas.ini' Example: 2.3. 
HANDLING OF ATOMIC COORDINATES 37 # Example for: GENERATE_TOPOLOGY, BUILD_MODEL # This will read a specified atom file, generate all hydrogen atoms, # add atomic radii and charges, and write the model to a PDB file in # the GRASP format. This can be used with GRASP to display electrostatic # properties without assigning charges and radii in GRASP. READ_TOPOLOGY FILE = '$(LIB)/top.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' SET TOPOLOGY_MODEL = 1 READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas' # Have to patch the topology here to remove sulfhydryl hydrogens: PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '17' '39' PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '3' '22' PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '53' '59' PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '41' '52' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off WRITE_MODEL FILE = '1fas.ini', MODEL_FORMAT = 'GRASP' 2.3.6 TRANSFER__XYZ _ copy templates' coordinates to MODEL Options: CLUSTER_CUT = -1.0 definition of a cluster Requirements: alignment and MODEL Description: This command transfers coordinates of the equivalent atoms and their isotropic temperature factors from the template structures to MODEL. The alignment has to be in memory. The target sequence is the last protein in the alignment and has to be the same as the MODEL sequence. The template structures are all the other proteins in the alignment. Before transferring coordinates, the template structures generally have to be explicitly least-squares superposed onto each other. This is most conveniently achieved with the MALIGN3D command called just before TRANSFER_XYZ. This is an important difference relative to Modeller-3, which did not require explicit superposition by the user. Note, however, that the 'model' script does this superposition automatically.
If CLUSTER_CUT is less than 0, the transferred coordinates for a given target atom are the average of the coordinates of all the equivalent template atoms. Otherwise, the transferred coordinates are the average of the templates in the largest cluster of the atoms. This cluster is obtained as follows (it only works when all templates and the target have exactly the same topology): For each residue, calculate pairwise RMS distances between all pairs of template residues. Use the weighted pair-group average clustering method (the same as in the DENDROGRAM command) to obtain the clustering tree for the given residue position. Find the clusters that contain residues joined above CLUSTER_CUT angstroms (1A is a good value). Use the largest cluster in the averaging for the target coordinates. Both kinds of averaging, but especially the cluster averaging, are useful for deriving a consensus model from an ensemble of models of the same sequence. If the consensus model is optimized by the conjugate gradients method, it frequently has a significantly lower value of the objective function than any of the contributing 38 CHAPTER 2. MODELLER COMMANDS models. Thus, the construction of a consensus model can also be seen as part of an efficient optimization. The reason why consensus construction frequently results in better models is that the consensus model generally picks the best (i.e., most frequent) conformation for the regions that are variable in the individual models, while it is very unlikely that a single model will have optimal conformation in all of the variable regions. The consensus construction may not work when two or more locally optimal conformations are inconsistent with each other (e.g., because of the atom overlaps). Two atoms are equivalent if they have exactly the same name and are in the equivalent residues. 
Note that the $ATMEQV_LIB library of equivalent residue-residue atom pairs, which is used in the construction of homology-derived distance restraints, is not used here. The atom names in the target may not correspond to the atom names in the template files. In such a case, if you want to copy the template atoms' coordinates, you have to edit the atom names in the template atom files so that they correspond to the Modeller atom names (which you can see in the .ini atom file). At least for water molecules, this is usually better than letting the optimizer deal with grossly incorrect starting positions. The atoms with undefined coordinates in MODEL are flagged by setting the coordinates to -999. The coordinates of the undefined atoms of the MODEL can be set with the BUILD_MODEL command, which relies on the internal coordinates specified in the residue topology library. Example: # Example for: TRANSFER_XYZ # This will build a model for a given sequence by copying # coordinates from aligned templates. When the templates # have the same sequence as the target, this procedure ensures # that the new model corresponds to the MODELLER topology library. 
READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Read the sequence and calculate its topology: READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '2ctx' '1nbt' MALIGN3D FIT = off SET ADD_SEQUENCE = on READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = ALIGN_CODES '1fas' GENERATE_TOPOLOGY SEQUENCE = '1fas' # Assign the average of the equivalent template coordinates to MODEL: TRANSFER_XYZ # Get the remaining undefined coordinates from internal coordinates: BUILD_MODEL INITIALIZE_XYZ = off # Write the final MODEL coordinates to a PDB file: WRITE_MODEL FILE = '1fas.ini' 2.3.7 TRANSFER__RES__NUMB _ residue numbers from MODEL2 to MODEL Options: ALIGN_CODES = 'all' codes of proteins in the alignment Requirements: MODEL & MODEL2 [& alignment] Description: This command transfers residue numbers and chain ids from MODEL2 to MODEL. It uses the current alignment if present, otherwise a 1:1 correspondence is assumed. MODEL2 and MODEL must 2.3. HANDLING OF ATOMIC COORDINATES 39 correspond to the first and second protein in the alignment, respectively. The ALIGN_CODES variable is used only for output to the log file, not in the calculation. Both MODEL and MODEL2 must already be in memory. Example: # Example for: TRANSFER_RES_NUMB # This will transfer residue numbers and chain ids from model2 to model. # Optionally, read an alignment for the transfer (otherwise 1:1 is assumed): READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '2ctx' '1fas' # Read the template and target models: READ_MODEL2 FILE = '2ctx' READ_MODEL FILE = '1fas' # Transfer the residue and chain ids and write out the new MODEL: TRANSFER_RES_NUMB WRITE_MODEL FILE = '1fas.ini' 2.3.8 RENAME__SEGMENTS _ rename MODEL segments Options: SEGMENT_IDS = '' new segment ids RENUMBER_RESIDUES = on whether to renumber residues from 1 to N Requirements: MODEL Description: This command re-labels residue numbers in each chain (i.e., segment) so that they start with 1 if RENUMBER_RESIDUES is on.
In addition, the single character PDB chain id's are also assigned: They are obtained from the corresponding elements of SEGMENT_IDS. Thus, there should be as many elements in SEGMENT_IDS as there are chains in the current MODEL. Example: # Example for: RENAME_SEGMENTS # This will assign new PDB single-character chain id's to all the chains # in the input PDB file (here there are two 'chains': protein and the HETATM # water molecules). # Read the MODEL with all HETATM and water records (so there are two 'chains'): READ_MODEL FILE = '1fas', HETATM_IO = on, WATER_IO = on # Assign new segment names and write out the new model: RENAME_SEGMENTS SEGMENT_IDS = 'X' 'Y' WRITE_MODEL FILE = '1fas.ini' 2.3.9 PICK__ATOMS _ select atoms in MODEL Options: PICK_ATOMS_SET = 1 index of the selected atoms set: 1 _ 2 _ 3 40 CHAPTER 2. MODELLER COMMANDS SELECTION_SEARCH = 'SEGMENT' search method: 'SPHERE' _ 'SEGMENT' RES_TYPES = 'ALL' residue type selection: 'ALL' _ 'HET' _ 'STD' _ Charmm 4-letter codes ATOM_TYPES = 'ALL' atom type selection: 'ALL' _ 'SDCH' _ 'MNCH' _ IUPAC atom names SELECTION_FROM = 'ALL' selecting from: 'ALL' _ 'SELECTED' SELECTION_MODE = 'ATOM' selecting what: 'ATOM' _ 'RESIDUE' SELECTION_STATUS = 'INITIALIZE' what to do with selected atoms: 'ADD' _ 'REMOVE' _ 'INITIALIZE' o For SELECTION_SEARCH = 'SEGMENT': SELECTION_SEGMENT = '@:@' 'X:X' RES:CHN ids for the first and last residues in a chain/segment SELECTION_STEP = 1 increase in residue index when selecting residues o For SELECTION_SEARCH = 'SPHERE': SPHERE_CENTER = 'undefined' '#RES1:C' 'ATOM_NAME' 'undefined' SPHERE_RADIUS = 10.0 sphere radius for atoms selection SELECTION_SLAB = -9999 9999 0 0 0 slab for atoms selection: 'dz1' 'dz2' 'xtrans' 'ytrans' 'ztrans' o For SELECTION_SEARCH = 'SPHERE_SEGMENT': SELECTION_SEGMENT = '@:@' 'X:X' RES:CHN ids for the first and last residues in a chain/segment SPHERE_RADIUS = 10.0 sphere radius for atoms selection Description: This command adds atoms to, removes atoms from, 
or initializes any one of the three independent sets of selected atoms of MODEL. There are three selection sets because it is convenient to have different sets used by different Modeller commands. PICK_ATOMS_SET specifies the set of selected atoms. Set 1 is used in the PICK_RESTRAINTS, ROTATE_DIHEDRALS, RANDOMIZE_XYZ and MUTATE_MODEL commands. Sets 2 and 3 are used in the MAKE_RESTRAINTS command. SELECTION_STATUS determines whether the selected atoms are added ('ADD'), removed ('REMOVE'), or a set is initialized and then the selected atoms are added ('INITIALIZE'). The selection of atoms is a hierarchical process consisting of two levels. The first level of selection consists of specifying how the atoms will be scanned. This is achieved by setting the SELECTION_SEARCH variable to either 'SEGMENT', 'SPHERE', or 'SPHERE_SEGMENT': 1. 'SEGMENT' mode: Search scans only a single stretch of residues specified by the beginning and ending residue identifiers in SELECTION_SEGMENT (Section 2.4.1). SELECTION_STEP is a step in the residue index used in scanning the atoms. This is useful in selecting only every 5-th CA atom, for example. 2. 'SPHERE' mode: Search scans only those atoms that are closer than SPHERE_RADIUS angstroms to the SPHERE_CENTER atom, after the center atom was translated by (xtrans, ytrans, ztrans) angstroms specified in SELECTION_SLAB[3:5]. If the first element of SPHERE_CENTER is string 'INDEX', the second element is an integer atom index of the center atom; otherwise, the first and second element are the residue identifier (Section 2.4.1) and the IUPAC atom name, respectively. SELECTION_SLAB[1:2] specifies the interval on the Z-axis relative to the Z coordinate of the translated central atom that imposes another condition on the selected atoms: Zcen + dz1 < Z + ztrans < Zcen + dz2. Larger Z values are in front, so dz1 specifies the plane that is further away than the dz2 plane. To pick any atoms, dz1 < dz2. 3.
'SPHERE_SEGMENT' mode: Search scans spheres around the atoms in the specified segment of residues. This is useful, for example, when a neighborhood of a loop needs to be selected. If SELECTION_FROM is 'SELECTED', scanning specified above is restricted only to the atoms that were already selected before calling PICK_ATOMS. 2.3. HANDLING OF ATOMIC COORDINATES 41 Once the method for scanning the atoms is specified, each of the scanned atoms is checked against the specified atom name(s) (ATOM_TYPES) and residue name(s) (RES_TYPES). If SELECTION_MODE is 'RESIDUE', all atoms in a residue with at least one atom that matches both the residue and atom name criteria are selected. Otherwise, only those atoms that have both the specified residue and atom names are selected. The RES_TYPES and ATOM_TYPES keywords can contain several residue and atom names in one quoted string or in several quoted strings. For example, both 'CA' 'N' and 'CA N' are valid specifications selecting the CA and N atoms. The following groups of residues and atoms are defined: o If RES_TYPES contains word 'ALL', all residues will be selected. o If RES_TYPES contains word 'HET', all `HETATM' residues will be selected (e.g., all residue types with the Modeller residue code larger than 27; see library $RESTYP_LIB). o If RES_TYPES contains word 'BLK', all `BLK' residue types will be selected (Section 2.2.1). o If RES_TYPES contains word 'STD', all standard residue types will be selected. Standard residue types are all residue types but `HETATM' and `BLK' types. o If ATOM_TYPES contains word 'ALL', all atoms will be selected. o If ATOM_TYPES contains word 'MNCH', all mainchain atoms will be selected. Mainchain atoms are N, C, CA, O, and OXT. o If ATOM_TYPES contains word 'SDCH', all sidechain atoms will be selected. Sidechain atoms are all non-mainchain atoms, including non-mainchain atoms in `HETATM' and `BLK' residues. 
Example: # Example for: PICK_ATOMS # This will pick various subsets of atoms in the MODEL and compare them # with MODEL2. # Read the models and the alignment: READ_MODEL FILE = '1fas' READ_MODEL2 FILE = '2ctx' READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1fas' '2ctx' # Set some defaults (the same as in top.ini): SET SELECTION_MODE = 'ATOM' # only the selected atoms, not whole residues SET SELECTION_FROM = 'ALL' # scanning of all atoms, not selected atoms SET SELECTION_SEARCH = 'SEGMENT' # scan over a segment SET SELECTION_SEGMENT= 'X:' '@:' # the whole chain as a segment SET RES_TYPES = 'ALL' # all residue types SET PICK_ATOMS_SET = 1 # put the selected atoms in set 1 SET SELECTION_STATUS = 'INITIALIZE' # select only the selected atoms # Pick and superpose mainchain atoms: PICK_ATOMS ATOM_TYPES = 'MNCH' SUPERPOSE # Pick and superpose sidechain atoms: PICK_ATOMS ATOM_TYPES = 'SDCH' SUPERPOSE # Pick and superpose CA and CB atoms: PICK_ATOMS ATOM_TYPES = 'CA CB' SUPERPOSE # Pick and superpose all atoms: PICK_ATOMS ATOM_TYPES = 'ALL' 42 CHAPTER 2. 
MODELLER COMMANDS SUPERPOSE # Pick and superpose CA and CB atoms in one segment only: PICK_ATOMS ATOM_TYPES = 'CA CB', SELECTION_SEGMENT = '2:' '10:' SUPERPOSE SET SELECTION_SEGMENT = 'X:' '@:' # allow for the whole chain again # Pick and superpose all atoms within 6 angstroms of the 'CA' atom in residue '10:': PICK_ATOMS ATOM_TYPES = 'ALL', SPHERE_RADIUS = 6.0, ; SELECTION_SEARCH = 'SPHERE', SPHERE_CENTER = '10:' 'CA' SUPERPOSE 2.3.10 PICK__HOT__ATOMS _ pick atoms violating restraints Options: VIOL_REPORT_CUT = 4.5 4.5 4.5 4.5 4.5 cutoffs for selecting violated restraints 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 999 999 999 999 4.5 4.5 4.5 4.5 4.5 4.5 999 4.5 4.5 4.5 4.5 4.5 4.5 4.5 PICK_HOT_CUTOFF = 4.0 radius for picking hot atoms SELECTION_MODE = 'ATOM' selecting what: 'ATOM' _ 'RESIDUE' EXTEND_HOT_SPOT = 0 whether to extend hot spots Description: This command selects those selected atoms (set 1) in the MODEL that should be optimized to remove hot spots in the MODEL; only selected restraints are considered. More precisely, the command first flags violated selected atoms. An atom is violated if it is part of a violated restraint. A restraint of physical group i (Table 2.4) is violated when its relative deviation from the optimal value is larger than specified in VIOL_REPORT_CUT[i]. For restraints that are based on probability density functions, relative violation is defined as the difference between the actual and the ideal values divided by the standard deviation (`relative heavy violation'); energy based restraints have ad hoc definitions (Table 2.2). The command then flags those selected atoms that are within the PICK_HOT_CUTOFF angstroms of any of the already flagged atoms. Next, if SELECTION_MODE is 'RESIDUE', all atoms in the residues that have at least one atom flagged are also flagged. In addition, the contiguous segments of flagged residues are extended for EXTEND_HOT_SPOT residues on either side. 
This command is usually followed by the PICK_RESTRAINTS and OPTIMIZE commands to select all the restraints that operate on selected (hot) atoms and optimize positions of these hot atoms. Example: # Example for: PICK_HOT_ATOMS # This will pick atoms violated by some restraints (bond length restraints here), # select restraints operating on violated atoms, and calculate the energy for # the selected restraints only (note that a list of violated restraints # can be obtained by the ENERGY command alone, without preceding it with # PICK_HOT_ATOMS). 2.3. HANDLING OF ATOMIC COORDINATES 43 READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Read the sequence, calculate its topology and coordinates: READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ # Just to get some violations: RANDOMIZE_XYZ DEVIATION = 0.03 # Create the bond length restraints and ignore the hard sphere overlap: MAKE_RESTRAINTS RESTRAINT_TYPE = 'bond', DYNAMIC_SPHERE = off # Pick hot atoms and the corresponding violated and neighbouring restraints: PICK_HOT_ATOMS PICK_RESTRAINTS ADD_RESTRAINTS = off # Calculate the energy of the selected restraints and write them out in detail: ENERGY OUTPUT = 'VERY_LONG' 2.3.11 RANDOMIZE__XYZ _ randomize MODEL coordinates Options: DEVIATION = 0.0 coordinate randomization amplitude in angstroms RAND_SEED = -8123 random seed from -50000 to -2 Description: This command randomizes the Cartesian coordinates of the selected atoms (set 1) in MODEL. If DEVIATION is positive, the coordinates are randomized by the addition of a random number uniformly distributed in the interval from -DEVIATION to +DEVIATION angstroms.
If DEVIATION is negative, the coordinates are assigned a random value uniformly distributed in the interval from -|DEVIATION| to +|DEVIATION| angstroms. Example: # Example for: RANDOMIZE_XYZ # This will randomize the X,Y,Z of the model: READ_MODEL FILE = '1fas' # Change existing X,Y,Z for +- 4 angstroms: RANDOMIZE_XYZ DEVIATION = 4.0 WRITE_MODEL FILE = '1fas.ini1' # Assign X,Y,Z in the range from -100 to 100 angstroms: RANDOMIZE_XYZ DEVIATION = -100.0 WRITE_MODEL FILE = '1fas.ini2' 44 CHAPTER 2. MODELLER COMMANDS 2.3.12 IUPAC__MODEL _ standardize certain dihedral angles Requirements: MODEL Description: This routine swaps specific pairs of atoms within some residues of MODEL so that certain dihedral angles are within ±90°, satisfying the IUPAC convention [IUP, 1970, Kendrew et al., 1970]. These residues, pairs of atoms, and dihedral angles are: o Phe, Tyr: (CD1, CD2), (CE1, CE2); χ2; o Asp: (OD1, OD2); χ2; o Glu: (OE1, OE2); χ3; o Arg: (NH1, NH2). It is possible that for distorted sidechains, neither of the two possibilities satisfies the IUPAC convention. In such a case, a warning message is written to the log file. Example: # This will swap certain atom names in some planar sidechains to satisfy # the IUPAC convention. READ_MODEL FILE = '2abx' IUPAC_MODEL WRITE_MODEL FILE = '2abx.iup' 2.3.13 REORDER__ATOMS _ standardize order of MODEL atoms Requirements: topology library & MODEL Description: This routine reorders atoms within the residues of MODEL so that they follow the order in the current residue topology library. Example: # Example for: REORDER_ATOMS, REORDER2_ATOMS # This will standardize the order of atoms in the model. # Order the atoms according to a topology library: READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_MODEL FILE = '1fas' REORDER_ATOMS WRITE_MODEL FILE = '1fas.ini1' # Order the atoms according to the first residue of the same # type in MODEL2: READ_MODEL FILE = '1fas' READ_MODEL2 FILE = '2abx' REORDER2_ATOMS WRITE_MODEL FILE = '1fas.ini2' 2.3.
HANDLING OF ATOMIC COORDINATES 45 2.3.14 REORDER2__ATOMS _ order MODEL atoms by MODEL2 Requirements: MODEL & MODEL2 Description: This command reorders atoms within the residues of MODEL so that they follow the order of atoms in the first residue of the same type in MODEL2. Example: See REORDER_ATOMS command. 2.3.15 ROTATE__DIHEDRALS _ change dihedral angles Options: DIHEDRALS = 'PHI' 'PSI' 'CHI1' dihedral angle type selection: 'phi' _ 'psi' _ 'CHI2' 'CHI3' 'CHI4' 'omega' _ 'chi1' _ 'chi2' _ 'chi3' _ 'chi4' _ 'chi5' _ 'alpha' CHANGE = 'RANDOMIZE' what to do: 'RANDOMIZE' _ 'OPTIMIZE' DEVIATION = 0.0 coordinate randomization amplitude in angstroms RAND_SEED = -8123 random seed from -50000 to -2 Requirements: for CHANGE='OPTIMIZE': topology & MODEL & restraints for CHANGE='RANDOMIZE': topology & MODEL Description: This command changes the dihedral angles in MODEL. CHANGE selects an optimization (when equal to 'OPTIMIZE') or randomization (when equal to 'RANDOMIZE'): 1. When optimizing, this command finds the first selected restraint that restrains the specified dihedral angle of each selected residue. It then sets the value of that dihedral to the most likely value. A residue is selected if any of its atoms is in the set 1 of selected atoms. 2. When randomizing, the command changes the specified dihedral angle of each selected residue by adding a random value distributed uniformly from -DEVIATION to +DEVIATION degrees. The value of the random seed variable, RAND_SEED, is changed after returning from the RANDOMIZE command. Use a negative integer from -2 to -50000 as the seed for the random number generator. DIHEDRALS can be either a vector of dihedral angle names or a single string containing all the dihedral angle names separated by blanks. The dihedral angles involved in cyclic structures are not changed (e.g., sidechain dihedral angles in disulfide bonds and prolines).
The dihedral angles that can be changed are listed at the top of the $RESDIH_LIB library: alpha, phi, psi, omega, chi1, chi2, chi3, chi4, chi5. Dihedral angle 'alpha' is the virtual Cα dihedral angle defined by four consecutive Cα atoms. The bond connectivity of the MODEL has to exist before this command is executed. If you read in the model by READ_MODEL, the bond connectivity is defined by subsequent calls to READ_TOPOLOGY and GENERATE_TOPOLOGY (also make sure that SEQUENCE entry does not exist in the alignment or that no alignment is in memory). Example: # Example for: ROTATE_DIHEDRALS # This will optimize and randomize dihedrals in a MODEL READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' 46 CHAPTER 2. MODELLER COMMANDS # Select dihedral angle types for optimization and randomization: SET DIHEDRALS = 'phi psi omega chi1 chi2 chi3 chi4 chi5' # Read the sequence, get its topology and coordinates: READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini1', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off ROTATE_DIHEDRALS CHANGE = 'RANDOMIZE', RAND_SEED = -2312, DEVIATION = 90.0 WRITE_MODEL FILE = '1fas.ini1' # Get restraints from somewhere and optimize dihedrals: MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' ROTATE_DIHEDRALS CHANGE = 'OPTIMIZE' WRITE_MODEL FILE = '1fas.ini2' 2.3.16 ORIENT__MODEL _ center and orient MODEL Description: This command translates the MODEL so that its gravity center is at the origin of the coordinate system and that the three principal axes of the model's inertia ellipsoid correspond to the x, y, and z axes of the coordinate system. It may even be used for approximate superposition if molecules have a similar non-spherical shape. Information about the principal axes is written to the log file.
Example: # Example for: ORIENT_MODEL # This will orient the model along the principal axes of the inertia ellipsoid: READ_MODEL FILE = '1fas' ORIENT_MODEL WRITE_MODEL FILE = '1fas.ini' 2.3.17 ROTATE__MODEL _ rotate and translate MODEL Options: TRANSLATION = 0.0 0.0 0.0 translation vector for MODEL ROTATION_MATRIX = 1 0 0 0 1 0 0 0 1 rotation matrix for MODEL ROTATION_ANGLE = 0.0 rotation of MODEL around axis [degrees] ROTATION_AXIS = 1.0 0.0 0.0 rotation axis for MODEL Description: This command transforms the Cartesian coordinates of MODEL. Translation is specified by a translation vector TRANSLATION and is done first. 2.3. HANDLING OF ATOMIC COORDINATES 47 Rotation is specified by a rotation matrix ROTATION_MATRIX that is given as a vector of 9 elements (three rows times three columns), with column index running first: $a_{11}, a_{12}, a_{13}, a_{21}, a_{22}, a_{23}, a_{31}, a_{32}, a_{33}$. The rotation matrix pre-multiplies the Cartesian coordinate vectors. The matrix corresponds to the view matrix of Quanta and to the rotation matrix of Molscript. The second kind of rotation is specified by a screw transformation, given by the ROTATION_AXIS axis and ROTATION_ANGLE rotation around the axis (in degrees). This is done last. If only some transformations are desired, set the other values to 0. Example: # Example for: ROTATE_MODEL # This will orient a model as specified (no change in this example): # Read the structure and transform it: READ_MODEL FILE = '1fas' ROTATE_MODEL TRANSLATION = 0 0 0, ROTATION_MATRIX = 1 0 0 ; 0 1 0 ; 0 0 1,; ROTATION_ANGLE = 0, ROTATION_AXIS = 1 1 1 WRITE_MODEL FILE = '1fas.ini' 2.3.18 WRITE__DATA _ write derivative MODEL data Options: FILE = 'default' root of output filename(s) OUTPUT = 'LONG' what to calculate and write out: 'ALL' _ 'PSA' _ 'NGH' _ 'DIH' _ 'SSM' _ 'CRV' Requirements: topology file & TOPOLOGY_MODEL Description: This command writes the selected types of data about the MODEL to a corresponding file and to the `fourth' column of the model.
The root of the output filenames is given by the FILE variable. In addition to the output files, the Biso field of the model (`fourth column' in the PDB file) will be assigned the last selected property from the following list: residue accessibility of type ACCESSIBILITY_TYPE (from 1 to 10, for the columns in the .psa file), dihedral type ACCESSIBILITY_TYPE (from 1 to 9 for α, φ, ψ, ω, χ1, χ2, χ3, χ4, and χ5; where α is the virtual dihedral angle between four successive Cα atoms), number of residue neighbors, the secondary structure type, and the local mainchain curvature. The data to be calculated are specified by concatenating the corresponding keywords in the OUTPUT variable: o 'ALL': All types of data are written to the corresponding files. o 'PSA': The atomic and residue solvent accessibilities are written to the .sol and .psa files, respectively. The algorithm for the solvent contact areas is described in [Richmond & Richards, 1978]. The normalization for the fractional areas is carried out as described in [Hubbard & Blundell, 1987], with the normalization factors courtesy of Simon Hubbard (personal communication). The single reference for all this is [Sali & Overington, 1994]. o 'NGH': Residue neighbors of each residue are listed to a .ngh file. The Modeller definition of a residue-residue contact used in restraints derivation is applied [Sali & Blundell, 1993]: Any pair of residues that has any pair of atoms within 6 Å of each other is in contact. o 'DIH': All the dihedral angle types defined in the $RESDIH_LIB library (virtual Cα, mainchain, and sidechain dihedral angles) are written to a .dih file. 48 CHAPTER 2. MODELLER COMMANDS o 'SSM': Secondary structure assignments are written to a .ssm file. The algorithm for secondary structure assignment depends on the Cα positions only and is based on the distance matrix idea described in [Richards & Kundrot, 1988].
For each secondary structure type, a `library' Cα distance matrix was calculated by averaging distance matrices for several secondary structure segments from a few high resolution protein structures. Program Dssp was used to assign these secondary structure segments [Kabsch & Sander, 1983]. Outlier distances were omitted from the averaging. Currently, there are only two matrices: one for the α-helix (secondary structure type 2) and one for the β-strand (type 1). The algorithm for secondary structure assignment is as follows: 1. For each secondary structure type (begin with a helix, which can thus overwrite parts of strand if they overlap): - Define the degree of the current secondary structure fit for each Cα atom by Drms deviation (P1) and maximal distance difference (P2) obtained by comparing the library distance matrix with the distance matrix for a segment starting at the given Cα position; - Assign the current secondary structure type to all Cα's in all segments whose Drms deviation and maximal distance difference are less than some cutoffs (P1 < cut1, P2 < cut2) and are not already assigned to `earlier' secondary structure types; 2. Split kinked contiguous segments of the same type into separate segments: Kinking residues have both DRMS and maximal distance difference beyond their respective cutoffs (P1 > cut3, P2 > cut4). The actual single kink residue separating the two new segments of the same type is the central kinking residue. Note: we are assuming that there are no multiple kinks within one contiguous segment of residues of the same secondary structure type. The kink residue type is -2. 3. If the current secondary structure type is β-strand: Eliminate those runs of strand residues that are not close enough to other strand residues separated by at least two other residues: P3 is minimal distance to a non-neighboring residue of the strand type (P3 < cut5).
Currently, only one pass of this elimination is done, but could be repeated until self-consistency. 4. Eliminate those segments that are shorter than the cutoff (cut6) length (e.g., 5 or 6). 5. Remove the isolated kinking residues (those that occur on their own or begin or end a segment). o 'CRV': Local mainchain curvatures are written to a .crv file. Straightness of residue i is calculated as follows: Calculate the angle between two vectors i; i - 2 and i + 2; i for all residues i. Use the running window average method to smooth these angles (use weights 0.5, 1, and 0.5). A better method would be to calculate the angles between least-squares lines through a small number of C ff atoms on each side of the central atom, but that remains to be implemented. Example: # Example for: WRITE_DATA # This will calculate solvent accessibility, dihedral angles, and # residue-residue neighbors for a structure in the PDB file. # Get topology library for radii and the model without waters and HETATMs: READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' SET HETATM_IO = off, WATER_IO = off READ_MODEL FILE = '1fas' # Calculate residue solvent accessibilities, dihedral angles, and # residue-residue neighbors: WRITE_DATA FILE = '1fas', OUTPUT = 'PSA DIH NGH SSM CRV' 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 49 2.4 Comparison and searching of sequences and structures This section describes the format of the alignment file and commands for reading, writing, making, analyzing and using the alignments of sequences and structures (pairwise and multiple). It also includes a description of the command for searching a sequence database. For the underlying dynamic programming method see Section 6.1. 
2.4.1 Alignment file format The preferred format for comparative modeling is related to the PIR database format: C; A sample alignment in the PIR format; used in tutorial >P1;5fd1 structureX:5fd1: 1 : : 106 : :ferredoxin:Azotobacter vinelandii: 1.90:0.192 AFVVTDNCIKCKYTDCVEVCPVDCFYEGPNFLVIHPDECIDCALCEPECPAQAIFSEDEVPEDMQEFIQLNAELA EVWPNITEKKDPLPDAEDWDGVKGKLQHLER* >P1;1fdx sequence:1fdx: 1 : : 54 : :ferredoxin:Peptococcus aerogenes: 2.00:-1.00 AYVINDSC--IACGACKPECPVNIIQGS--IYAIDADSCIDCGSCASVCPVGAPNPED* The first line of each sequence entry specifies the protein code after the >P1; line identifier. The line identifier must occur at the beginning of the line. For example, 5fd1 is the protein code of the first entry in the alignment above. The protein code corresponds to the ALIGN_CODES variable. The second line of each entry contains information necessary to extract atomic coordinates of the segment from the original PDB coordinate set. The fields in this line are separated by colon characters, `:'. The fields are as follows: Field 1: A specification of whether or not 3D structure is available and of the type of the method used to obtain the structure (structureX, X-ray; structureN, NMR; structureM, model; sequence, sequence). The bare keyword structure (with no method suffix) is also a valid value. Field 2: The PDB code. While the protein code in the first line of an entry, which is used to identify the entry, must be unique for all proteins in the file, the PDB code in this field, which is used to get structural data, does not have to be unique. It is a good idea to use the PDB code with an optional chain identifier as the protein code. The PDB code corresponds to the ATOM_FILES variable and can also contain the full atom filename, directory included. Fields 3-6: The residue identifiers (see below) for the first (fields 3-4) and last residue (fields 5-6) of the sequence in the subsequent lines. 
There is no need to edit the coordinate file if a contiguous sequence of residues is required _ simply specify the beginning and ending residues of the required contiguous region of the chain. If the beginning residue is not found, no segment is read in. If the ending residue identifier is not found in the coordinate file, the last residue in the coordinate file is used. By default, the whole file is read in. The unspecified beginning and ending residue numbers and chain id's for a structure entry in an alignment file are taken automatically from the corresponding atom file, if possible. The first matching sequence in the atom file that also satisfies the explicitly specified residue numbers and chain id's is used. A residue number is not specified when a blank character or a dot, `.', is given. A chain id is not specified when a dot, `.', is given. This slight difference between residue and chain id's is necessary because a blank character is a valid chain id. Field 7: Protein name. Optional. Field 8: Source of the protein. Optional. Field 9: Resolution of the crystallographic analysis. Optional. 50 CHAPTER 2. MODELLER COMMANDS Field 10: R-factor of the crystallographic analysis. Optional. A residue identifier consists of a residue number and an optional chain identifier. They must be separated by a colon, `:'. For example, '10I:A' is residue number '10I' in chain 'A', and '6' or '6:' is residue number '6' in a chain without a name. Free format can be used, that is the blank characters are ignored. The residue number is a string of up to 5 characters long, as found in the PDB atom file and consists of the PDB residue number proper (22X,A4 in the PDB ATOM record) and PDB residue insertion code (26X, A1). The chain identifier is a single character, as found in the PDB atom file (21X,A1). A string containing `@' will match any residue number and chain id. For example, `@:A' is the first residue in chain `A' and `@:@' is the first residue in the coordinate file. 
The same residue identifier format is used in several other Top variables used for residue specification (e.g., MODEL_SEGMENT, SELECTION_SEGMENT). When an alignment file is used in conjunction with structural information, the first two fields must be filled in, the rest of them can be empty or even missing entirely. If the alignment is not used in conjunction with structural data, all but the first field can be empty. This means that in comparative modeling, the template structures must have at least the first two fields specified while the target sequence must only have the first field filled in. Thus, a simple second line of an entry in an alignment file in the 'PIR' format is structure:pdb_file:.:.:.:. This entry will result in reading from PDB file pdb_file the structure segment corresponding to the sequence in the subsequent lines of the alignment entry. The fields that do not exist are assigned blank values. Thus, structure:pdb_file is equivalent to structure:pdb_file: : : : : : : : which will achieve what was probably intended (read in the structure segment from file pdb_file that corre- sponds to the sequence in the subsequent lines of the alignment entry) only if the chain id is a blank character. Each sequence must be terminated by the terminating character, `*'. When the first character of the sequence line is the terminating character, `*', the sequence is obtained from the specified PDB coordinate file (Section 2.1.3). Chain breaks are indicated by `/'. There should not be more than one chain break character to indicate a single chain break (use gap characters instead, `-'). All residue types specified in $RESTYP_LIB, but not patching residue types, are allowed; there are on the order of 100 residue types specified in the $RESTYP_LIB library. To add your own residue types to this library, see Chapter 4, Question 18. The alignment file can contain any number of blank lines between the protein entries. 
Comment lines can occur outside protein entries and must begin with the identifiers `C;' or `R;' as the first two characters in the line. An alignment file is also used to input non-aligned sequences. The best way to generate initial alignment files containing PDB sequences, which can later be edited by hand, is to follow this example: # Specify the PDB and protein codes in the alignment: SET ATOM_FILES = '1fdx' '5fd1', ALIGN_CODES = '1fdx' '5fd1' READ_MODEL FILE = '1fdx', MODEL_SEGMENT = '@:@' 'X:X' # Read the whole 1fdx atom file SEQUENCE_TO_ALI # Copy the residues to the alignment array READ_MODEL FILE = '5fd1', MODEL_SEGMENT = '1:' '63:' # Read 5fd1 atom file from 1-63 SEQUENCE_TO_ALI ADD_SEQUENCE = on # Add this segment to the alignment array MALIGN GAP_PENALTIES = -500 -300 # align them by sequence WRITE_ALIGNMENT FILE = 'fer1-seq.ali' MALIGN3D GAP_PENALTIES = 0.0 2.0 # align them by structure CHECK_ALIGNMENT # check the alignment for its suitability for modeling WRITE_ALIGNMENT FILE = 'fer1.ali' 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 51 2.4.2 READ__ALIGNMENT _ read sequences and/or their alignment Options: FILE = 'default' partial or complete filename DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') ALIGN_CODES = 'all' codes of proteins in the alignment ALIGNMENT_FORMAT = 'PIR' format of the alignment file: 'PIR' _ 'PAP* *' _ 'QUANTA' _ 'INSIGHT' REMOVE_GAPS = on whether to remove all-gap positions in input alig* *n- ment ADD_SEQUENCE = off whether to add the new sequences to the existing alignment STOP_ON_ERROR = 1 whether to stop on error Output: MODELLER_STATUS = , NUMB_OF_SEQUENCES, ALIGN_CODES Description: This command reads the sequence(s) and/or their alignment from a text file. Only sequences with the specified codes are read in; ALIGN_CODES = 'all' can be used to read all sequences. There are several alignment formats: 1. The 'PIR' format resembles that of the PIR sequence database. 
It is described in Section 2.4.1 and is used for comparative modeling because it allows for additional data about the proteins that are useful for automated access to the atomic coordinates. 2. The 'PAP' format is nicer to look at but contains less information and is not used by other programs. When used in conjunction with PDB files, the PDB files must contain exactly the residues in the sequences in the 'PAP' file; i.e., it is not possible to use only a segment of a PDB file. In addition, the 'PAP' protein codes must be expandable in proper PDB atom filenames, as described in Section 2.1.3. 3. The 'QUANTA' format can be used to communicate with the Quanta program. You are not supposed to mix 'QUANTA' format with any other format because the 'QUANTA' format contains residue numbers which do not occur in the other formats and are difficult to guess correctly. 4. The 'INSIGHT' format is very similar to the 'PAP' format and can sometimes be used to communicate with the InsightII program. When used in conjunction with PDB files, the same rules as for the 'PAP' format apply. If REMOVE_GAPS = on, positions with gaps in all selected sequences are removed from the alignment. If ADD_SEQUENCE is on, the new sequences are added to the current ones, otherwise the old sequences are deleted. Example: # Example for: READ_ALIGNMENT, WRITE_ALIGNMENT, # READ_ALIGNMENT2, WRITE_ALIGNMENT2, # CHECK_ALIGNMENT # Read an alignment, write it out in the 'PAP' format, and # check the alignment of the N-1 structures as well as the # alignment of the N-th sequence with each of the N-1 structures. READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = 'all' WRITE_ALIGNMENT FILE = 'toxin.pap', ALIGNMENT_FORMAT = 'PAP' CHECK_ALIGNMENT 52 CHAPTER 2. 
MODELLER COMMANDS 2.4.3 READ__ALIGNMENT2 _ read 2nd alignment Options: FILE = 'default' partial or complete filename DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') ALIGN_CODES = 'all' codes of proteins in the alignment ALIGNMENT_FORMAT = 'PIR' format of the alignment file: 'PIR' _ 'PAP* *' _ 'QUANTA' _ 'INSIGHT' REMOVE_GAPS = on whether to remove all-gap positions in input alig* *n- ment STOP_ON_ERROR = 1 whether to stop on error Output: MODELLER_STATUS = Description: This command reads the sequences and/or their alignment from a text file into the second alignment array. The two alignments can be compared by the COMPARE_ALIGNMENTS command. Example: See READ_ALIGNMENT command. 2.4.4 CHECK__ALIGNMENT _ check alignment for modeling Options: ATOM_FILES = '' complete or partial atom filenames ALIGN_CODES = 'all' codes of proteins in the alignment Description: This command evaluates an alignment to be used for comparative modeling. It uses two criteria: First, it checks the alignment of the template structures (all but the last entry in the alignment): For each pairwise superposition of the templates, it reports those equivalent pairs of C ff atoms that are more than 6A away from each other. Such pairs are almost certainly misaligned. The pairwise superpositions are done using the C ff atoms and the given alignment. Second, the command checks the alignment of the target sequence (the last entry in the alignment) with each of the templates: For all consecutive pairs of C ff atoms in the target, it calculates the distance between the two equivalent C ffatoms in each of the templates. If the distance is longer than 8A , it is reported. In such a case, the alignment between the template and the target is almost certainly incorrect. 
Example: # Example for: READ_ALIGNMENT, WRITE_ALIGNMENT, # READ_ALIGNMENT2, WRITE_ALIGNMENT2, # CHECK_ALIGNMENT # Read an alignment, write it out in the 'PAP' format, and # check the alignment of the N-1 structures as well as the # alignment of the N-th sequence with each of the N-1 structures. READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = 'all' WRITE_ALIGNMENT FILE = 'toxin.pap', ALIGNMENT_FORMAT = 'PAP' CHECK_ALIGNMENT 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 53 2.4.5 COLOR__ALN__MODEL _ color MODEL according to alignment Description: This command colors MODEL according to a given alignment between MODEL and a sequence. MODEL has to be the first protein in the alignment. The second protein can be any sequence, with or without known structure. The MODEL can be displayed on the screen, colored by `the fourth parameter' and inspected for the structural context of deletions and insertions. This is useful in optimizing the alignment for comparative modeling. The isotropic temperature factors in MODEL are set as follows: o 0, for those regions that have residues in both MODEL and the sequence (blue in Rasmol ; light green in Quanta ); o 1, for the two residues that span regions occurring in the sequence but not in MODEL (green in Rasmol ; pink in Quanta ); o 2, regions that occur in MODEL but are deleted from the sequence (red in Rasmol ; bright green in Quanta ). Example: # Example for: COLOR_ALN_MODEL # Two demos: # # 1) Use a given alignment to color a structure according to # insertions and deletions in a pairwise alignment. # # 2) Superpose two 3D structure and do (1). 
# Demo 1: READ_MODEL FILE = '1nbt' READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1nbt' '1fas', ; REMOVE_GAPS = on COLOR_ALN_MODEL WRITE_MODEL FILE = '1nbt-1.clr' # Demo 2: SET ATOM_FILES = '1nbt' '1fas', ALIGN_CODES = '1nbt' '1fas' READ_MODEL FILE = '1nbt', MODEL_SEGMENT = '1:A' '66:A' SEQUENCE_TO_ALI READ_MODEL FILE = '1fas', MODEL_SEGMENT = '1:' '61:' SEQUENCE_TO_ALI ADD_SEQUENCE = on ALIGN GAP_PENALTIES_1D= -600 -400 MALIGN3D GAP_PENALTIES_3D= 0 3.0 WRITE_ALIGNMENT FILE = 'color_aln_model.pap', ALIGNMENT_FORMAT = 'PAP' READ_MODEL FILE = '1nbt', MODEL_SEGMENT = '1:A' '66:A' COLOR_ALN_MODEL WRITE_MODEL FILE = '1nbt-2.clr' 2.4.6 COMPARE__ALIGNMENTS _ compare two alignments Requirements: READ_ALIGNMENT & READ_ALIGNMENT2 54 CHAPTER 2. MODELLER COMMANDS Description: This command compares two pairwise alignments read by the READ_ALIGNMENT and READ_- ALIGNMENT2 commands. The alignment of the first sequence with the second sequence in ALIGNMENT is evaluated with respect to ALIGNMENT2. The numbers are not symmetric; i.e., they will change if the sequences or alignments are swapped. The output in the log file is supposed to be self-explanatory. Example: # Example for: COMPARE_ALIGNMENTS, SEQUENCE_TO_ALI # Compare two alignments of two proteins each. In this case, the first # alignment is a sequence-sequence alignment and the second alignment # is a structure-structure alignment. 
# ATOM_FILES and ALIGN_CODES have to be set explicitly so that the alignment # file has this information (ATOM_FILES is copied to the alignment array # during SEQUENCE_TO_ALI): SET ALIGN_CODES = '1fas' '2ctx', ATOM_FILES = ALIGN_CODES # Generate and save sequence-sequence alignment: READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI READ_MODEL FILE = '2ctx' SEQUENCE_TO_ALI ADD_SEQUENCE = on ALIGN GAP_PENALTIES_1D= -600 -400 WRITE_ALIGNMENT FILE = 'toxin-seq.ali' # Generate and save structure-structure alignment: ALIGN3D GAP_PENALTIES_3D= 0 2.0 WRITE_ALIGNMENT FILE = 'toxin-str.ali' DELETE_ALIGNMENT # Compare the two pairwise alignments: READ_ALIGNMENT FILE = 'toxin-str.ali' READ_ALIGNMENT2 FILE = 'toxin-seq.ali' COMPARE_ALIGNMENTS 2.4.7 SEQUENCE__TO__ALI _ copy MODEL sequence and coordinates to alignment Options: ADD_SEQUENCE = off whether to add the new sequences to the existing alignment ALIGN_CODES = 'all' codes of proteins in the alignment OUTPUT_DIRECTORY = '' output directory Output: NUMB_OF_SEQUENCES, ALIGN_CODES, ATOM_FILES Description: This command copies the sequence and coordinates of the MODEL to the alignment arrays. If ADD_SEQUENCE is on the sequence is added to the sequences that are already in the alignment arrays, otherwise it becomes the only sequence in those arrays. When sequence i is added, the corresponding elements of ALIGN_CODES and ATOM_FILES are used to set the protein and PDB code fields in the alignment file, respectively. 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 55 Example: See COMPARE_ALIGNMENTS command. 
2.4.8 WRITE_ALIGNMENT — write sequences and/or their alignment Options: FILE = 'default' partial or complete filename OUTPUT_DIRECTORY = '' output directory ATOM_FILES = '' complete or partial atom filenames ALIGN_CODES = 'all' codes of proteins in the alignment ALIGNMENT_FORMAT = 'PIR' format of the alignment file: 'PIR' | 'PAP' | 'QUANTA' | 'INSIGHT' ALIGN_BLOCK = 1 the last sequence in the first block of sequences ALIGN_ALIGNMENT = TYPEVALUES DEFAULT DESCRIPTION ALIGNMENT_FEATURES = 'INDICES CONSERVATION' what alignment features to write out: 'ACCURACY' | 'HELIX' | 'BETA' | 'ACCESSIBILITY' | 'STRAIGHTNESS' | 'CONSERVATION' | 'INDICES' | 'GAP1' | 'GAP2' | 'GAPS' CUT_OVERHANGS = off whether to cut overhangs at OVERHANG residues or not OVERHANG = 0 un-penalized overhangs in protein comparisons HETATM_IO = off whether to read HETATM coordinates WATER_IO = off whether to read water coordinates HYDROGEN_IO = off whether to read hydrogen coordinates ATOM_FILES_DIRECTORY = './' input atom files directory list (e.g., 'dir1:dir2:dir3:./:/') Description: This command writes the whole alignment to a text file. The 'PAP' format, which corresponds to a relatively nice looking alignment, has several additional formatting options that can be selected by the ALIGNMENT_FEATURES variable. This scalar variable can contain any combination of the following keywords: o 'INDICES', the alignment position indices; o 'CONSERVATION', a star for each absolutely conserved position; o 'ACCURACY', the alignment accuracy indices, scaled between 0-9, as calculated by ALIGN_CONSENSUS; o 'HELIX', average content of helical residues for structures 1 - ALIGN_BLOCK at each position, 0 for 0% and 9 for 100%, as calculated by ALIGN2D; o 'BETA', average content of β-strand residues for structures 1 - ALIGN_BLOCK at each position, 0 for 0% and 9 for 100%, as calculated by ALIGN2D; 
o 'ACCESSIBILITY', average relative sidechain buriedness for structures 1 - ALIGN_BLOCK, 0 for 0% (100% accessibility) and 9 for 100% (0% accessibility), as calculated by ALIGN2D; o 'STRAIGHTNESS', average mainchain straightness for structures 1 - ALIGN_BLOCK at each position, 0 for 0% and 9 for 100%, as calculated by ALIGN2D; o 'GAP1', penalty factor for opening a gap in block 1, scaled between 0 and 9, as calculated by ALIGN2D; o 'GAP2', penalty factor for opening a gap in block 2, scaled between 0 and 9, as calculated by ALIGN2D. Options 'HELIX', 'BETA', 'ACCESSIBILITY', 'STRAIGHTNESS', 'GAP1', and 'GAP2' are valid only after executing command ALIGN2D, where the corresponding quantities are defined. They refer to the 3D profile defined for the first ALIGN_BLOCK structures (run ALIGN2D with FIT = off to prepare these structural data without changing the input alignment). Similarly, the 'ACCURACY' option is valid only after the ALIGN_CONSENSUS command. 56 CHAPTER 2. MODELLER COMMANDS ALIGN_ALIGNMENT and ALIGN_BLOCK are used to ensure correct indication of identical alignment positions, depending on whether sequences or two blocks of sequences were aligned: For sequences (ALIGN_ALIGNMENT = off and ALIGN_BLOCK is ignored), a '*' indicating a conserved position is printed where all sequences have the same residue type. For blocks (ALIGN_ALIGNMENT = on and ALIGN_BLOCK indicates the last sequence of the first block), a '*' is printed only where the two blocks have the same order of residue types (there has to be the same number of sequences in both blocks). The blocks option is useful when comparing two alignments, possibly aligned by the ALIGN command. If CUT_OVERHANGS is on, overhangs longer than OVERHANG residues are cut from the alignment. 
In such a case, the HETATM_IO, WATER_IO, HYDROGEN_IO, and ATOM_FILES_DIRECTORY keywords also apply because the beginning and ending residue numbers for the `structure' entries in the alignment are renumbered automatically by reading the appropriate atom files. Example: See READ_ALIGNMENT command. 2.4.9 DESCRIBE _ describe proteins Options: ATOM_FILES = '' complete or partial atom filenames ATOM_FILES_DIRECTORY = './' input atom files directory list (e* *.g., 'dir1:dir2:dir3:./:/') ALIGN_CODES = 'all' codes of proteins in the alignment Requirements: [alignment] Description: This command outputs basic data about the proteins whose atom filenames are specified by ATOM_- FILES or ALIGN_CODES. An alternative specification of the proteins to be described can be provided by an alignment in memory; i.e., READ_ALIGNMENT followed by DESCRIBE will describe all the proteins in the alignment. This command is useful for preparation before comparative modeling because it summarizes disulfides, cis-prolines, charges, chain breaks, etc . When an alignment is given, results depending only on the amino acid sequences are still written out even if some atom files do not exist. Example: # Example for: DESCRIBE # Describe the sequences and structures in the alignment. READ_ALIGNMENT FILE = 'toxin.ali' DESCRIBE 2.4.10 ID__TABLE _ calculate percentage sequence identities Options: ALIGN_CODES = 'all' codes of proteins in the alignment MATRIX_FILE = 'family.mat' the filename of the pairwise distance matrix OUTPUT_DIRECTORY = '' output directory Requirements: alignment 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 57 Description: This command calculates percentage residue identities for all pairs of sequences in the current alignment. The percentage residue identity is defined as the number of identical residues divided by the length of the shorter sequence. The ALIGN_CODES variable is only used for output, not in calculations, so it does not have to be set. 
In addition to the output in the log file, this routine creates file OUTPUT_DIRECTORY/MATRIX_FILE with pairwise sequence distances that can be used directly as the input to the tree-making programs of the Phylip package, such as Kitsch [Felsenstein, 1985], and also for the DENDROGRAM and PRINCIPAL_COMPONENTS commands. A more general version of this command, which allows a user-specified measure for residue-residue differences, is SEQUENCE_COMPARISON. Example: # Example for: ID_TABLE, SEQUENCE_COMPARISON, PRINCIPAL_COMPONENTS, DENDROGRAM # Pairwise sequence identity between sequences in the alignment. # Read all entries in this alignment: READ_ALIGNMENT FILE = 'toxin.ali' # Calculate pairwise sequence identities: ID_TABLE MATRIX_FILE = 'toxin_id.mat' # Calculate pairwise sequence similarities: SET RR_FILE = '$(LIB)/as1.sim.mat', MAX_GAPS_MATCH = 1 SEQUENCE_COMPARISON MATRIX_FILE = 'toxin.mat', VARIABILITY_FILE = 'toxin.var' # Do principal components clustering using sequence similarities: PRINCIPAL_COMPONENTS MATRIX_FILE = 'toxin.mat', FILE = 'toxin.princ' # Dendrogram in the log file: DENDROGRAM 2.4.11 SEQUENCE_COMPARISON — compare sequences in alignment Options: RR_FILE = '$(LIB)/as1.sim.mat' input residue-residue scoring file DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') MATRIX_FILE = 'family.mat' the filename of the pairwise distance matrix VARIABILITY_FILE = 'undefined' output filename OUTPUT_DIRECTORY = '' output directory ALIGN_CODES = 'all' codes of proteins in the alignment MAX_GAPS_MATCH = 1 Description: The pairwise similarity of sequences in the current alignment is evaluated using user-specified residue-residue scores. The residue-residue scores, including gap-residue scores, are read from file RR_FILE. The sequence pair score is equal to the average pairwise residue-residue score for all alignment positions that have at most MAX_GAPS_MATCH gaps (1 by default). The comparison matrix is written in the Phylip format to file MATRIX_FILE. 
58 CHAPTER 2. MODELLER COMMANDS The family variability as a function of alignment position is calculated as the Rms deviation of all residue - residue scores at a given position, but only for those alignment positions that have at most MAX_GAPS_- MATCH gaps. The variability is written to file VARIABILITY_FILE, as is the number of pairwise comparisons contributing to each positional variability. Example: See ID_TABLE command. 2.4.12 DENDROGRAM _ clustering Options: MATRIX_FILE = 'family.mat' the filename of the pairwise distance matrix Description: This command calculates a clustering tree from the input matrix of pairwise distances. This matrix must be in the Phylip format and can be produced by the ID_TABLE, SEQUENCE_COMPARISON, or COMPARE commands. The weighted pair-group average clustering method is used. The tree is written to the log file. This command is useful for deciding about which known 3D structures are to be used as templates for comparative modeling. Example: See ID_TABLE command. 2.4.13 PRINCIPAL__COMPONENTS _ clustering Options: MATRIX_FILE = 'family.mat' the filename of the pairwise distance matrix FILE = 'default' output file Description: This command calculates principal components clustering for the input matrix of pairwise distances. This matrix must be in the Phylip format and can be produced by the ID_TABLE, SEQUENCE_- COMPARISON, or COMPARE commands. The projected coordinates p and q are written to file FILE. The output file can be used with Asgl to produce a principal components plot. This command is useful for deciding about which known 3D structures are to be used as templates for comparative modeling. Example: See ID_TABLE command. 
2.4.14 ALIGN _ align two (blocks of ) sequences Options: RR_FILE = '$(LIB)/as1.sim.mat' input residue-residue scoring file DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') GAP_PENALTIES_1D = -400 -150 gap creation and extension penalties for se- quence/sequence alignment ALIGN_BLOCK = 1 the last sequence in the first block of sequences STOP_ON_ERROR = 1 whether to stop on error OFF_DIAGONAL = 100 to speed up the alignment MATRIX_OFFSET = 0.00 substitution matrix offset for local alignment OVERHANG = 0 un-penalized overhangs in protein comparisons 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 59 LOCAL_ALIGNMENT = off whether to do local as opposed to global align- ment ALIGN_WHAT = 'BLOCK' what to align in ALIGN; 'BLOCK' _ 'ALIGNMENT' _ 'LAST' Output: MODELLER_STATUS = Description: This command aligns two blocks of sequences. The two blocks of sequences to be aligned are sequences 1 to ALIGN_BLOCK and ALIGN_BLOCK+1 to the last sequence. The sequences within the two blocks should already be aligned; their alignment does not change. The command can do either the global (similar to [Needleman & Wunsch, 1970]; LOCAL_ALIGNMENT = off) or local dynamic programming alignment (similar to [Smith & Waterman, 1981]; LOCAL_ALIGNMENT = on). For the global alignment, set overhang length OVERHANG to more than 0 so that the corresponding number of residues at either of the four termini won't be penalized by any gap penalties (this makes it a pseudo local alignment). To speed up the search, set OFF_DIAGONAL to a number smaller than the shortest sequence length. The alignments matching residues i and j with |i - j| > OFF_DIAGONAL are not considered at all in the search for the best alignment. The gap initiation and extension penalties are specified by GAP_PENALTIES_1D. The residue type - residue type scores are read from file RR_FILE. The routine automatically determines whether it has to maximize similarity or minimize distance. 
MATRIX_OFFSET applies to local alignment only and influences its length. MATRIX_OFFSET should be somewhere between the lowest and highest residue-residue scores. A smaller value of this parameter will make the local alignments shorter when distance is minimized, and longer when similarity is maximized. This works as follows: The recursively constructed dynamic programming comparison matrix is reset to 0 at position (i, j) when the current alignment score becomes larger (distance) or smaller (similarity) than MATRIX_OFFSET. Note that this is equivalent to the usual shifting of the residue-residue scoring matrix in the sense that there are two combinations of GAP_PENALTIES_1D and MATRIX_OFFSET values that will give exactly the same alignments irrespective of whether the matrix is actually offset (with 0 used to restart local alignments in dynamic programming) or the matrix is not offset but MATRIX_OFFSET is used as the cutoff for restarting local alignments in dynamic programming. For the same reason, the matrix offset does not have any effect on the global alignments if the gap extension penalty is also shifted by half of the matrix offset. The position-position score is an average residue-residue score for all possible pairwise comparisons between the two blocks (n × m comparisons are done, where n and m are the number of sequences in the two blocks, respectively). The first exception to this is when ALIGN_WHAT is set to 'ALIGNMENT', in which case the two alignments defined by ALIGN_BLOCK are aligned; i.e., the score is obtained by comparing only equivalent positions between the two alignment blocks (only n comparisons are done, where n is the number of sequences in each of the two blocks). This option is useful in combination with COMPARE_ALIGNMENTS and WRITE_ALIGNMENT for evaluation of various alignment parameters and methods. 
The second excep- tion is when ALIGN_WHAT is set to 'LAST', in which case only the last sequences in the two blocks are used to get the scores. In 'block', 'alignment', and 'last' comparisons, penalty for a comparison of a gap with a residue during the calculation of the scoring matrix is obtained from the score file (gap-gap match should have a score of 0.0). Only the 20 standard residue types, plus Asx (changes to Asn) and Glx (changes to Gln) are recognized. Every other unrecognized residue, except for a gap and a chain break, changes to Gly for comparison purposes. If you receive an error message to increase the MAXRES constant, you can try to increase the gap penalties first. Here and elsewhere in Modeller , MAXRES is both the maximal number of residues in a protein as well as the maximal length of an alignment. If the length of the alignment arrays is too small, MODELLER_STATUS becomes 1 (Section 2.1.2). All the other problems with this Top command, such as the number of sequences larger than MAXSEQ, stop the execution on the spot. 60 CHAPTER 2. MODELLER COMMANDS For the time being, this and the other alignment commands (MALIGN, ALIGN2D, ALIGN3D, and MALIGN3D) remove chain break information from the CALN array, which means that chain breaks are not retained when the alignment is written to a file after executing these commands. 
Example: # Example for: ALIGN # This will read two sequences, align them, and write the alignment # to a file: READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1fas' '2ctx' # The as1.sim.mat similarity matrix is used by default: ALIGN GAP_PENALTIES_1D = -600 -400 WRITE_ALIGNMENT FILE = 'toxin-seq.ali' 2.4.15 ALIGN2D — align sequences with structures Options: RR_FILE = '$(LIB)/as1.sim.mat' input residue-residue scoring file DIRECTORY = '' directory of RR_FILE GAP_PENALTIES_1D = -400 -150 gap creation and extension penalties for sequence/sequence alignment GAP_PENALTIES_2D = 0.5 0.5 0.5 0.5 1.0 6.0 gap penalties for sequence/structure alignment: helix, beta, accessibility, straightness, and CA-CA distance ALIGN_BLOCK = 1 the last sequence in the first block of sequences STOP_ON_ERROR = 1 whether to stop on error OFF_DIAGONAL = 100 to speed up the alignment MATRIX_OFFSET = 0.00 substitution matrix offset for local alignment OVERHANG = 0 un-penalized overhangs in protein comparisons LOCAL_ALIGNMENT = off whether to do local as opposed to global alignment FIT = on whether to align Output: MODELLER_STATUS = Description: This command aligns a block of sequences (second block) with a block of structures (first block). It is the same as the ALIGN command except that a variable gap opening penalty is used. This gap penalty depends on the 3D structure of all sequences in block 1. The variable gap penalty can favor gaps in exposed regions, avoid gaps within secondary structure elements, favor gaps after curved parts of the mainchain, and minimize the distance between the two Cα positions spanning a gap. The ALIGN2D command is preferred for aligning a sequence with structure(s) in comparative modeling because it tends to place gaps in a better structural context. See Section 6.1.2 for the dynamic programming algorithm that implements the variable gap penalty. 
The algorithms used by this command are not fully evaluated yet and are likely to change in the near future. The linear gap penalty function for inserting a gap in block 1 of structures is: $g = f_1(H, S, B, C)\,u + l\,v$, where $u$ and $v$ are the usual gap opening and extension penalties, $l$ is the gap length, and $f_1$ is a function that is at least 1, but can be larger to make gap opening more difficult in the following circumstances: between two consecutive (i.e., $i$, $i+1$) helical positions, two consecutive β-strand positions, two consecutive buried positions, or two consecutive positions where the mainchain is locally straight. This function is $f_1 = 1 + [\omega_H H_i H_{i+1} + \omega_S S_i S_{i+1} + \omega_B B_i B_{i+1} + \omega_C C_i C_{i+1}]$, where the weights $\omega$ are the first four numbers in variable GAP_PENALTIES_2D, $H_i$ is the fraction of helical residues at position $i$ in block 1, $S_i$ is the fraction of β-strand residues at position $i$ in block 1, $B_i$ is the average relative sidechain buriedness of residues at position $i$ in block 1, and $C_i$ is the average straightness of residues at position $i$ in block 1. See Section 2.3.18 for the definition of these features. The original straightness is modified here by assigning maximal straightness of 1 to all residues in a helix or a β-strand. The linear gap penalty function for opening a gap in block 2 of sequences is: $g = f_2(H, S, B, C, D)\,u + l\,v$, where $f_2$ is a function that is at least 1, but can be larger to make the gap opening in block 2 more difficult in the following circumstances: when the first gap position is aligned with a helical residue, a β-strand residue, a buried residue, extended mainchain, or when the whole gap in block 2 is spanned by two residues in block 1 that are far apart in space. This function is $f_2 = 1 + [\omega_H H_i + \omega_S S_i + \omega_B B_i + \omega_C C_i + \omega_D \sqrt{d - d_o}]$. 
Here, $d$ is the distance between the two Cα atoms spanning the gap, averaged over all structures in block 1, and $d_o$ is the distance that is small enough to correspond to no increase in the opening gap penalty (e.g., 6 Å). Parameters $\omega_H$, $\omega_S$, $\omega_B$, $\omega_C$, $\omega_D$, and $d_o$ are specified by GAP_PENALTIES_2D. When FIT is off, no alignment is done and the routine returns only the average structural information, which can be written out by the WRITE_ALIGNMENT command. Example: # Demonstrating ALIGN2D, aligning with variable gap penalty # Read aligned structure(s): READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '2ctx' '2abx' SET ADD_SEQUENCE = on, ALIGN_BLOCK = NUMB_OF_SEQUENCES # Read aligned sequence(s): READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1nbt' # Structure sensitive variable gap penalty sequence-sequence alignment: SET OVERHANG = 0 ALIGN2D GAP_PENALTIES_1D= -400 -150, GAP_PENALTIES_2D = 0.5 0.5 0.5 0.5 0.5 6 WRITE_ALIGNMENT FILE = 'align2d.ali', ALIGNMENT_FORMAT = 'PIR', WRITE_ALIGNMENT FILE = 'align2d.pap', ALIGNMENT_FORMAT = 'PAP', ; ALIGNMENT_FEATURES='INDICES HELIX BETA STRAIGHTNESS ACCESSIBILITY CONSERVATION GAP1 GAP2' CHECK_ALIGNMENT # Color the first template structure according to gaps in alignment: READ_ALIGNMENT FILE = 'align2d.ali', ALIGN_CODES = '2ctx' '1nbt', ; ALIGNMENT_FORMAT = 'PIR', ADD_SEQUENCE = off, REMOVE_GAPS = on READ_MODEL MODEL_SEGMENT = '2ctx', FILE = '2ctx' COLOR_ALN_MODEL WRITE_MODEL FILE = '2ctx.aln.pdb' # Color the first template structure according to secondary structure: WRITE_DATA OUTPUT = 'SSM BISO_SSM', FILE = '2ctx' WRITE_MODEL FILE = '2ctx.ssm.pdb' # Superpose the target structure onto the first template: READ_MODEL2 FILE = '1nbt.pdb', MODEL2_SEGMENT = '1nbt' '1nbt' PICK_ATOMS ATOM_TYPES = 'CA' SUPERPOSE WRITE_MODEL2 FILE = '1nbt.fit.pdb' 62 CHAPTER 2. 
MODELLER COMMANDS 2.4.16 MALIGN _ align two or more sequences Options: RR_FILE = '$(LIB)/as1.sim.mat' input residue-residue scoring file DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') GAP_PENALTIES_1D = -400 -150 gap creation and extension penalties for se- quence/sequence alignment OFF_DIAGONAL = 100 to speed up the alignment MATRIX_OFFSET = 0.00 substitution matrix offset for local alignment OVERHANG = 0 un-penalized overhangs in protein comparisons LOCAL_ALIGNMENT = off whether to do local as opposed to global align- ment STOP_ON_ERROR = 1 whether to stop on error Output: MODELLER_STATUS = Description: This command performs a multiple sequence alignment. The sequences to be aligned are the se- quences in the current alignment arrays. The command uses the dynamic programming method for the best sequence alignment, given the gap initiation and extension penalties specified by GAP_PENALTIES_1D, and residue type weights read from file RR_FILE. See command ALIGN for more information. The algorithm for the multiple alignment is as follows. First, sequence 2 is aligned with sequence 1. Next, sequence 3 is aligned with an average of the aligned sequences 1 and 2; i.e., the weight matrix is an average of the weights 1-3 and 2-3. For this averaging, the gap-residue and gap-gap weights are obtained from the residue-residue weight matrix file, not from gap penalties. If the corresponding weights are not in the file, they are set to the worst and best residue-residue score, respectively. See instructions for ALIGN for more details. 
Example: # Example for: MALIGN # This will read all sequences from a file, align them, and write # the alignment to a new file: READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = 'all' MALIGN GAP_PENALTIES_1D= -600 -400 WRITE_ALIGNMENT FILE = 'toxin-seq.pap', ALIGNMENT_FORMAT = 'PAP' 2.4.17 ALIGN_CONSENSUS — consensus sequence alignment Options: GAP_PENALTIES_1D = -400 -150 gap creation and extension penalties for sequence/sequence alignment ALIGN_BLOCK = 1 the last sequence in the first block of sequences STOP_ON_ERROR = 1 whether to stop on error Output: MODELLER_STATUS = 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 63 Description: This command is similar to ALIGN except that a consensus alignment of two blocks of sequences is produced. A consensus alignment is obtained from a consensus similarity matrix using the specified gap penalties and the global dynamic programming method. The consensus similarity matrix is obtained by aligning the two blocks of sequences many times with different parameters and methods and counting how many times each pair was aligned. This command is still experimental and no detailed description is given at this time. This command also produces the alignment accuracy that can be printed out by the WRITE_ALIGNMENT command in the 'PAP' format (0 inaccurate, 9 accurate). If the gap initiation penalty is 0, a gap extension penalty of, say, 0.4 means that only those positions will be equivalenced that were aligned in at least 80% of the individual alignments (i.e., 2 times 0.40). Example: # Example for: ALIGN_CONSENSUS # This will read 2 sequences and prepare a consensus alignment # from many different pairwise alignments. 
READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '2ctx' '2abx' ALIGN_CONSENSUS GAP_PENALTIES_1D= 0 0.4, ALIGN_BLOCK = 1 WRITE_ALIGNMENT FILE = 'toxin-seq.pap', ALIGNMENT_FORMAT = 'PAP' 2.4.18 SUPERPOSE — superpose MODEL2 on MODEL given alignment Options: ALIGN_CODES = 'all' codes of proteins in the alignment FIT = on whether to superpose SUPERPOSE_REFINE = off whether to refine the superposition RMS_CUTOFFS = 3.5 3.5 60 60 15 60 60 60 60 60 60 only the first element is used for calculating the cutoff RMS and DRMS measures Requirements: MODEL & MODEL2 [& alignment] Description: This command superposes MODEL2 on MODEL, without changing the alignment. If an alignment is in memory, it is used to obtain the equivalent atoms. MODEL must be the first sequence in the alignment, MODEL2 must be the second sequence in the alignment. The equivalent atoms are those selected atoms (set 1) of the MODEL that have equivalently named atoms in MODEL2; the atom equivalences are defined in library $ATMEQV_LIB. Use the PICK_ATOMS command to select the desired atoms for superposition. By default, all atoms are selected. If there is no alignment, a 1:1 correspondence between the residues is assumed. No fitting is done if FIT = off. The ALIGN_CODES variable is used only for output, not in calculations. The RMS_CUTOFFS[1] element is the cutoff used in calculating the cutoff Rms deviations; i.e., those position and distance Rms deviations that are defined on the equivalent atoms which are less than RMS_CUTOFFS[1] angstroms away from each other (as superposed using all aligned positions) and those equivalent distances which are less than RMS_CUTOFFS[1] angstroms different from each other, respectively. If SUPERPOSE_REFINE is on, the refinement of the superposition is done by repeating the fitting with only those aligned pairs of atoms that are within RMS_CUTOFFS[1] of each other until there is no change in the number of equivalent positions. 
This refinement can only remove compared positions, not add them like ALIGN3D can do. This is useful for comparing equivalent parts of two structures with a fixed alignment but omitting divergent parts from the superposition and Rms deviation calculation; e.g., comparing a model with the X-ray structure. 64 CHAPTER 2. MODELLER COMMANDS Example: # Example for: SUPERPOSE # This will use a given alignment to superpose Calpha atoms of # one structure (2ctx) on the other (1fas). READ_MODEL FILE = '1fas' READ_MODEL2 FILE = '2ctx' SET ALIGN_CODES = '1fas' '2ctx' READ_ALIGNMENT FILE = 'toxin.ali' PICK_ATOMS PICK_ATOMS_SET = 1, ATOM_TYPES = 'CA' SUPERPOSE WRITE_MODEL2 FILE = '1fas.fit' Example: # Example for: ALIGN3D, SUPERPOSE # This will align 3D structures of two proteins: # First example: read sequences from a sequence file: READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1fas' '2ctx' ALIGN GAP_PENALTIES_1D= -600 -400 ALIGN3D GAP_PENALTIES_3D= 0 2.0 WRITE_ALIGNMENT FILE = 'toxin-str.ali' # Second example: read sequences from PDB files to eliminate the # need for the toxin.ali sequence file: READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' READ_MODEL FILE = '2ctx' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '2ctx', ; ALIGN_CODES = ALIGN_CODES '2ctx' ALIGN GAP_PENALTIES_1D= -600 -400 ALIGN3D GAP_PENALTIES_3D= 0 2.0 WRITE_ALIGNMENT FILE = 'toxin-str.ali' # And now superpose the two structures using current alignment to get # various RMS's: READ_MODEL FILE = '1fas' PICK_ATOMS ATOM_TYPES = 'CA' READ_MODEL2 FILE = '2ctx' SUPERPOSE FIT_ATOMS = 'CA' 2.4.19 COMPARE _ compare 3D structures given alignment Options: ALIGN_CODES = 'all' codes of proteins in the alignment ATOM_FILES = '' complete or partial atom filenames ATOM_FILES_DIRECTORY = './' input atom files directory list (e* *.g., 'dir1:dir2:dir3:./:/') 2.4. 
COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 65 OUTPUT = 'LONG' selects output: 'SHORT' | 'LONG' | 'RMS' | 'DRMS' MATRIX_FILE = 'family.mat' the filename of the pairwise distance matrix COMPARE_MODE = 3 selects the type of comparison: 1 | 2 | 3 RMS_CUTOFFS = 3.5 3.5 60 60 15 60 60 60 60 60 60 cutoffs for RMS, DRMS, Alpha Phi Psi Omega chi1 chi2 chi3 chi4 chi5 FIT_ATOMS = 'CA' whether to superpose before comparison DISTANCE_ATOMS = 'CA' 'CA' atom type used for variability calculations FIT = on whether to do pairwise least-squares fitting or ALIGN2D alignment ASGL_OUTPUT = off whether to write output for ASGL Description: This command compares the structures in the given alignment. It does not make an alignment, but it calculates the Rms and Drms deviations between atomic positions and distances, and class differences between the mainchain and sidechain dihedral angles. In contrast to the SUPERPOSE command, COMPARE works with a multiple alignment and it writes more information about the pairwise comparisons. If no alignment is available, it assumes a 1:1 correspondence for the proteins specified by ATOM_FILES or ALIGN_CODES. If ATOM_FILES is defined, it is used with the Modeller file-naming mechanism (Section 2.1.3) to find full names for the atom files. If it is not defined, ALIGN_CODES, which is usually set by the previous READ_ALIGNMENT command, is used. ALIGN_CODES does not have to be set if ATOM_FILES is set. OUTPUT selects short ('SHORT') or long ('LONG') form of output to the log file. If it contains the word 'RMS' or 'DRMS', it also outputs the Rms or Drms deviation matrix to file MATRIX_FILE. This file can be used with the Phylip program or with the DENDROGRAM or PRINCIPAL_COMPONENTS commands of Modeller to calculate a clustering of the structures. COMPARE_MODE selects the form of the positional variability calculated for each position along the sequence: 1, for true Rms deviation over all proteins that have a residue at the current position. 
This does not make any sense for periodic quantities like dihedral angles. 2, for the average absolute distance over all pairs of residues that have a residue at the current position. 3, the same as 2 except that the average distance, not its absolute value, is used (convenient for comparison of 2 structures to get the sign of the changes for dihedral angles and distances). RMS_CUTOFFS specifies cutoff values for calculation of the position, distance, and dihedral angle Rms deviations for pairwise overall comparisons. If the difference between two equivalent points is larger than the cutoff, it is not included in the Rms sum. The order of cutoffs in this vector is: atomic position, intra-molecular distance, α, Φ, Ψ, ω, χ1, χ2, χ3, χ4, and χ5 (there are 5 dihedrals in a disulfide bridge), where α is the virtual Cα dihedral angle between four consecutive Cα atoms. These cutoffs do not affect positional variability calculations. FIT_ATOMS string specifies all the atom types (including possibly a generic 'ALL') to be fitted in the least-squares superposition. These atom types are used in the least-squares superposition, and in calculation of the position and distance Rms deviations. DISTANCE_ATOMS[1] specifies the atom type that is used for getting the average structure and Rms deviation at each alignment position in the Asgl output file 'posdif.asgl'. This Asgl file contains the positional variability of the selected atom type in the family of compared proteins. The Asgl output files can then be used with Asgl scripts 'posdif' and 'dih' to produce PostScript plots of the corresponding variabilities at each alignment position. ASGL_OUTPUT has to be on to obtain the Asgl output files. If FIT = on, a least-squares superposition is done before the comparisons; otherwise, the orientation of the molecules in the input atom files is used. Example: See MALIGN3D command. 66 CHAPTER 2. 
MODELLER COMMANDS 2.4.20 ALIGN3D — align two structures Options: GAP_PENALTIES_3D = 0.0 1.75 gap creation and extension penalties for structure/structure superposition FIT_ATOMS = 'CA' one atom type used for superposition FIT = on whether to align STOP_ON_ERROR = 1 whether to stop on error OUTPUT = 'LONG' 'SHORT' | 'LONG' | 'VERY_LONG' ALIGN3D_TRF = off whether to transform the distances before dynamic programming ALIGN3D_REPEAT = off do several starts to maximize number of equivalent positions OFF_DIAGONAL = 100 to speed up the alignment MATRIX_OFFSET = 0.00 substitution matrix offset for local alignment OVERHANG = 0 un-penalized overhangs in protein comparisons LOCAL_ALIGNMENT = off whether to do local as opposed to global alignment Output: MODELLER_STATUS = Description: This command uses the current alignment as the starting point for an iterative least-squares superposition of two 3D structures. This results in a new pairwise structural alignment. If no alignment is in memory, the initial alignment is the 1:1 alignment. A good initial alignment may be obtained by sequence alignment (ALIGN). For superpositions, only one atom per residue is used, as specified by FIT_ATOMS[1]. The alignment algorithm is as follows. First, structure 2 is least-squares fit on structure 1 using all the equivalent residue positions in the initial alignment that have the specified atom type. Next, the residue-residue distance matrix is obtained by calculating Euclidean distances between all pairs of selected atoms from the two structures. The alignment of the two structures is then obtained by the standard dynamic programming optimization based on the residue-residue distance matrix. GAP_PENALTIES_3D[1] is a gap creation penalty (usually 0), and GAP_PENALTIES_3D[2] is a gap extension penalty, say 1.75. 
This procedure identifies pairs of residues as equivalent when they have their selected atoms at most 2 times GAP_PENALTIES_3D[2] angstroms apart in the current orientation (this is so when the gap initiation penalty is 0). The reason is that an equivalence costs the distance between the two residues while an alternative, the gap-residue and residue-gap matches, costs twice the gap extension penalty. From the dynamic programming run, a new alignment is obtained. Thus, structure 2 can be fitted onto structure 1 again, using this new alignment, and the whole cycle is repeated until there is no change in the number of equivalent positions and until the difference in the rotation matrices for the last two superpositions is very small. At the end, the framework, that is the alignment positions without gaps, is written to the log file. If FIT is off, no alignment is done. If OUTPUT contains 'SHORT', only the best alignment and its summary are displayed. If OUTPUT contains 'LONG', summaries are displayed for all initial alignments in each framework cycle. If OUTPUT contains 'VERY_LONG', all alignments are displayed. If ALIGN3D_TRF is on, the weights in the weight matrix are modified distances [Subbiah et al., 1993]. If ALIGN3D_REPEAT is on, three additional initial alignments are tried and the one resulting in the largest number of equivalent positions is selected. Note that this alignment method and all other comparison methods in Modeller are different from the multi-feature structural comparison implemented in Comparer [Sali & Blundell, 1990]. Example: 2.4. 
COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 67 # Example for: ALIGN3D, SUPERPOSE # This will align 3D structures of two proteins: # First example: read sequences from a sequence file: READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '1fas' '2ctx' ALIGN GAP_PENALTIES_1D= -600 -400 ALIGN3D GAP_PENALTIES_3D= 0 2.0 WRITE_ALIGNMENT FILE = 'toxin-str.ali' # Second example: read sequences from PDB files to eliminate the # need for the toxin.ali sequence file: READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' READ_MODEL FILE = '2ctx' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '2ctx', ; ALIGN_CODES = ALIGN_CODES '2ctx' ALIGN GAP_PENALTIES_1D= -600 -400 ALIGN3D GAP_PENALTIES_3D= 0 2.0 WRITE_ALIGNMENT FILE = 'toxin-str.ali' # And now superpose the two structures using current alignment to get # various RMS's: READ_MODEL FILE = '1fas' PICK_ATOMS ATOM_TYPES = 'CA' READ_MODEL2 FILE = '2ctx' SUPERPOSE FIT_ATOMS = 'CA' 2.4.21 MALIGN3D — align two or more structures Options: ALIGN_CODES = 'all' codes of proteins in the alignment ATOM_FILES = '' complete or partial atom filenames ATOM_FILES_DIRECTORY = './' input atom files directory list (e.g., 'dir1:dir2:dir3:./:/') GAP_PENALTIES_3D = 0.0 1.75 gap creation and extension penalties for structure/structure superposition OFF_DIAGONAL = 100 to speed up the alignment MATRIX_OFFSET = 0.00 substitution matrix offset for local alignment OVERHANG = 0 un-penalized overhangs in protein comparisons LOCAL_ALIGNMENT = off whether to do local as opposed to global alignment FIT_ATOMS = 'CA' one atom type for superposition FIT = on whether to align OUTPUT = 'LONG' 'SHORT' | 'LONG' | 'VERY_LONG' | 'NO_ALIGNMENT' WRITE_FIT = off whether to write out fitted coordinates to .fit files CURRENT_DIRECTORY = on whether to write output .fit files to current directory WRITE_WHOLE_PDB = on whether to write out all lines in the input PDB file 68 CHAPTER 2. 
MODELLER COMMANDS STOP_ON_ERROR = 1 whether to stop on error Output: MODELLER_STATUS = Description: This command uses the current alignment as the starting point for an iterative least-squares super- position of two or more 3D structures. This results in a new multiple structural alignment. If no alignment is in memory, the initial alignment is the 1:1 alignment. A good initial alignment may be obtained by sequence alignment (MALIGN). For superpositions, only one atom per residue is used, as specified by FIT_ATOMS. The resulting alignment can be written to a file with the WRITE_ALIGNMENT command. The alignment algorithm is as follows. There are several cycles, each of which consists of an update of a framework and a calculation of a new alignment; the new alignment is based on the superposition of the structures onto the latest framework. The framework in each cycle is obtained as follows. The initial framework consists of the atoms in structure 1 that correspond to FIT_ATOMS, but only at the alignment positions where all the structures have a residue. If there is no specified atom types in any of the residues at a given position, the coordinates for this framework position are approximated by the neighboring coordinates. Next, all other structures are fit to this framework. The final framework for the current cycle is then obtained as an average of all the structures, in their fitted orientations, but only for residue positions that are common to all of them, given the current alignment. Another result is that all the structures are now superposed on this framework. Note that the alignment has not been changed yet. Next, the multiple alignment itself is re-derived in N - 1 dynamic programming runs, where N is the number of structures. This is done as follows. First, structure 2 is aligned with structure 1, using the inter-molecular atom-atom distance matrix, for all atoms of the selected type, as the weight matrix for the dynamic programming run. 
Next, structure 3 is aligned with an average of structures 1 and 2 using the same dynamic programming technique. Structure 4 is then aligned with an average of structures 1-3, and so on. Averages of structures i-j are calculated for all alignment positions where there is at least one residue in any of the structures i-j (this is different from a framework which requires that residues from all structures be present). Note that in this step, residues out of the current framework may get aligned and the current framework residues may get unaligned. Thus, after the series of N - 1 dynamic programming runs, a new multiple alignment is obtained. This is then used in the next cycle to obtain the next framework and the next alignment. The cycles are repeated until there is no change in the number of equivalent positions. This procedure is best viewed as a way to determine the framework regions, not the whole alignment. The results from this command are expected to be similar to the output of program Mnyfit [Sutcliffe et al., 1987]. GAP_PENALTIES_3D[1] is a gap creation penalty (usually 0), and GAP_PENALTIES_3D[2] is a gap extension penalty, say 1.75. This procedure identifies pairs of positions as equivalent when they have their selected atoms at most 2 times GAP_PENALTIES_3D[2] angstroms apart in the current superposition (this is so when the gap initiation penalty is 0), as described for the ALIGN3D command. Argument OUTPUT can contain the following values: o 'SHORT', only the final framework is written to the log file. o 'NO_ALIGNMENT', only the initial alignment is displayed in the ALIGN3D format. o 'LONG', the framework after the alignment stage in each cycle is written to the log file. o 'VERY_LONG', the framework from the framework stage in each cycle is also written to the log. o 'NO_ALIGNMENT', only the initial alignment is displayed in the MALIGN3D format; no alignment is done. 
If WRITE_FIT is on, the fitted atom files are written out in their final fitted orientations. Their filenames are the original filenames with an extension .fit. If CURRENT_DIRECTORY is on, the output .fit files will go to the current directory. Otherwise, the output will be in the directory with the original files. If WRITE_WHOLE_PDB is on, the whole PDB files are written out; otherwise only the parts corresponding to the aligned sequences are output. If FIT is off, the initial alignment is not changed. This is useful when all the structures have to be superim- posed with the initial alignment (FIT = off and WRITE_FIT = on). 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 69 Example: # Example for: MALIGN3D, COMPARE # This will read all sequences from a sequence file, multiply align # their 3D structures, and then also compare them using this alignment. READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = 'all' MALIGN GAP_PENALTIES_1D= -600 -400 MALIGN3D GAP_PENALTIES_3D= 0 2.0, WRITE_FIT = on, WRITE_WHOLE_PDB = off WRITE_ALIGNMENT FILE = 'toxin-str.pap', ALIGNMENT_FORMAT = 'PAP' # Make two comparisons: no cutoffs, and 3.5A/60 degree cutoffs for RMS, DRMS, # and dihedral angle comparisons: COMPARE RMS_CUTOFFS = 999 999 999 999 999 999 999 999 999 999 999 COMPARE RMS_CUTOFFS = 3.5 3.5 60 60 60 60 60 60 60 60 60 2.4.22 EXPAND__ALIGNMENT _ put all models into alignment Options: DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') ROOT_NAME = 'undf' root of a filename for filename construction FILE_ID = 'default' file id for filename construction ID1 = 1 ID1 for filename construction ID2 = 1 ID2 for filename construction FILE_EXT = '' file extension for filename construction Output: alignment Description: ID1, ID2, ROOT_NAME, FILE_EXT, and FILE_ID are used to construct atom filenames for all the models (Section 2.1.3). Next, all the models are added to the alignment, using the last sequence in the input alignment as the guide. 
This allows easy multiple superposition of all the templates and models after comparative modeling (provided there are fewer than 'MAXSEQ' proteins in total, so that all can fit into the alignment arrays). Example: # Example for: EXPAND_ALIGNMENT # This will add models to the alignment. READ_ALIGNMENT FILE = 'toxin.ali', ALIGN_CODES = '2ctx' '2abx' EXPAND_ALIGNMENT ID1 = 1, ID2 = 3, ROOT_NAME = '2abx', FILE_ID = '.B', ; FILE_EXT = '', DIRECTORY = '' WRITE_ALIGNMENT FILE = 'toxin-expand.ali' 2.4.23 SEQUENCE_SEARCH — search for similar sequences Options: RR_FILE = '$(LIB)/as1.sim.mat' input residue-residue scoring file 70 CHAPTER 2. MODELLER COMMANDS FILE = 'default' file with the target sequence ALIGN_CODES = 'all' the code of the target sequence DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') GAP_PENALTIES_1D = -400 -150 gap creation and extension penalties for sequence/sequence alignment OFF_DIAGONAL = 100 to speed up the alignment MATRIX_OFFSET = 0.00 substitution matrix offset for local alignment OVERHANG = 0 un-penalized overhangs in protein comparisons LOCAL_ALIGNMENT = off whether to do local as opposed to global alignment SEARCH_CHAINS_LIST = $(LIB)/CHAINS_3.0_30_XN.cod file with sequences SEARCH_CHAINS_FILE = $(LIB)/CHAINS_3.0_30_XN.grp file with a list of sequence codes ALIGNMENT_FORMAT = 'PIR' sequence file formats; has to be 'PIR' ALIGNMENT_FEATURES = 'INDICES CONSERVATION' what alignment features to write out: 'ACCURACY' | 'HELIX' | 'BETA' | 'ACCESSIBILITY' | 'STRAIGHTNESS' | 'CONSERVATION' | 'INDICES' | 'GAP1' | 'GAP2' | 'GAPS' REMOVE_GAPS = on whether to remove all-gap positions in input alignment SEARCH_TOP_LIST = 20 the length of the output hits list OUTPUT = 'LONG' 'SHORT' | 'LONG' STOP_ON_ERROR = 1 whether to stop on error SEARCH_SORT = 'LONGER' which sequence to use for normalization when sorting the hit list: 'SHORTER' | 'LONGER' SEARCH_RANDOMIZATIONS = 0 number of randomizations for calculating the significance of a 
sequence/sequence similarity RAND_SEED = -8123 random seed from -50000 to -2 FAST_SEARCH = off whether to use fast sequence search or not FAST_SEARCH_CUTOFF = 1.0 if FAST_SEARCH is ON only sequences with database scan significance higher than this value are considered for randomization significance DATA_FILE = off whether results go to a separate file or not Output: MODELLER_STATUS = Description: This command searches a sequence database for proteins that are similar to a given target sequence. Target sequence is read from file FILE. ALIGN_CODES specifies the code of the target sequence in the FILE file. If only one sequence is in the file, you can use ALIGN_CODES = 'all' to read it without bothering about the actual sequence code. SEARCH_CHAINS_LIST specifies a file that contains protein codes for the proteins to be compared with the target sequence. The database sequences specified in SEARCH_CHAINS_LIST file must occur in the SEARCH_CHAINS_FILE file. The command uses the dynamic programming method for the best sequence alignment, given the gap cre- ation and extension penalties specified by GAP_PENALTIES_1D and residue type scores read from file RR_FILE. GAP_PENALTIES_1D[1] is a gap creation penalty (usually 2 for the identity distance matrix 'modlib/id.mat'), and GAP_PENALTIES_1D[2] is a gap extension penalty (usually 1 for the identity distance matrix). The SEARCH_TOP_LIST top hits are written to the log file at the end. The hits are sorted according to the fractional sequence identity score obtained by dividing the number of identical residue pairs by the length of the longer sequence (SEARCH_SORT = 'LONGER') or the shorter sequence (SEARCH_SORT = 'SHORTER'). 2.4. COMPARISON AND SEARCHING OF SEQUENCES AND STRUCTURES 71 The final list of hits contains three different significance values: 1. SIGNI. Z-score from sequence randomizations. This is the most accurate significance score, but the slowest one to calculate. 
For each pairwise comparison, the two sequences are shuffled a specified number of times (SEARCH_RANDOMIZATIONS) to obtain the mean and standard deviation of "random" scores from which the Z-score for an alignment score of a given pair of sequences is calculated. 2. SIGNI2. Z-score for sequence identity from the database scan. After comparison of the target sequence with all sequences in the database is done, the comparisons are sorted by the length of the database sequence. The pairwise sequence identities of the 20 sequences closest in length to the target sequence are used to calculate the average and standard deviation of the percentage sequence identities for subsequent calculation of the Z-score for the percentage sequence identity of a given pairwise alignment. 3. SIGNI3. Z-score for alignment score from the database scan. The procedure is the same as for SIGNI2, except that the alignment scores are used instead of the pairwise sequence identities. SEARCH_RANDOMIZATIONS specifies how many alignments of the shuffled sequences are done to calculate the significance score for the overall sequence similarity. If 0, the significance is not calculated. If more than 5 randomizations are done, the significance score, not sequence identity, is used for sorting the hit list. When FAST_SEARCH is on only those sequences that have a database-scan alignment score significance (SIGNI3 in output) above FAST_SEARCH_CUTOFF are used for the "full" randomization-based significance calculation. Since the mean and the standard deviation of the distribution obtained by randomizing the two compared sequences are much more appropriate than the corresponding quantities for the target/database comparisons, FAST_SEARCH should be on only when you are in a hurry and the database is large. If DATA_FILE is on the final results (list of PDB codes with significances, etc .) are also written to a separate file 'seqsearch.dat'. 
If OUTPUT is 'LONG', the best alignment for each sequence in SEARCH_CHAINS_FILE and its various scores are also written to the log file. If OUTPUT is 'VERY_LONG', individual scores obtained for randomized sequences are also written to the log file (this is almost never needed). Example: # Example for: SEQUENCE_SEARCH # This will search the MODELLER database of representative protein chains # for chains similar to the specified sequence. SET SEARCH_RANDOMIZATIONS = 20 # should use 100 in real life; SET GAP_PENALTIES_1D = -800 -400 # SET SEARCH_CHAINS_LIST = 'junk.cod' SEQUENCE_SEARCH FILE = 'toxin.ali', ALIGN_CODES = '1fas' 2.4.24 DELETE__ALIGNMENT _ delete alignment Description: This command deletes an existing alignment from the Modeller memory. This is useful when a default 1:1 correspondence, such as that between an X-ray structure and its Modeller model, is needed. This default alignment is constructed for the commands that need an alignment only if there is no alignment already in memory. Example: See PATCH command. 72 CHAPTER 2. MODELLER COMMANDS 2.5 Calculation of spatial restraints This Chapter explains how the restraints are represented in a restraint file and also describes commands for reading, writing, generating, and manipulating restraints. See Section 6.3 for equations defining the restraints and their derivatives with respect to atomic positions. See Section 2.6 for commands for calculating the objective function and Section 6.2 for optimization methods. See the original papers for the most detailed definition and description of the restraints [Sali & Blundell, 1993, Sali & Overington, 1994]. 2.5.1 Specification of restraints Static and dynamic restraints Static restraints are read from the restraints file or are generated by the MAKE_RESTRAINTS command. All other restraints are dynamic restraints and are created on the fly; they currently include only restraints on non-bonded atom pairs. 
Formats of the restraints file Restraints may be read from a restraints file in two formats, MODELLER or USER. The files in the MODELLER and USER formats have to begin with the lines `MODELLER12 VERSION: MODELLER FORMAT' and `MODELLER12 VERSION: USER FORMAT', respectively. In both formats, there is one entry per line. The format is free, except that the first character has to be at the beginning of the line. There are three different entry types in the MODELLER format: R Form Modality Feature Group Numb_atoms Numb_parameters 0 Atom_indices Parameters E Atom_index_1 Atom_index_2 P Pseudo_atom_index Pseudo_atom_type Numb_real_atoms Real_atom_indices For example, R 3 1 1 1 2 2 0 437 28 1.5000 0.1000 E 120 540 P 1 3 3 120 121 122 When the line starts with 'R', it contains a restraint, 'E' indicates a pair of atoms to be excluded from the calculation of the dynamic non-bonded pairs list, and 'P' indicates a pseudo atom definition (not documented further). The USER format recognizes only the R entries. The fields of a line in the USER format are: Id Form Modality Feature Group Numb_atoms Numb_parameters 0 Parameters Atom_ids For example, R 3 1 1 1 2 2 0 1.5000 0.1000 NH#:1:A CA:2:A The seven integer indices used to specify various restraint properties are listed in Tables 2.2-2.4. They are: Form specifies the mathematical form of the restraint. Modality should be viewed as the argument to Form. It specifies the number of single Gaussians in a poly-Gaussian pdf, periodicity n of the cosine in the cosine potential, and the number of spline points for cubic splines. Only certain combinations of Form and Modality are possible. Any Feature can be used with any Form/Modality pair. Group or `physical feature type' groups restraints for reporting purposes in ENERGY, ENERGY_PROFILE, etc. The number of atoms and parameters for the restraint are specified by Numb_atoms and Numb_parameters, respectively. The seventh integer index can be ignored.
Atom_indices and Parameters have to match the hard-wired conventions. The format of the atom id is ATOM_NAME:RESIDUE_#[:CHAIN_ID], where ATOM_NAME is the four character IUPAC atom name as found in a PDB file, RESIDUE_# is a five character residue number as it occurs in the PDB file of a model, and the optional CHAIN_ID is the single character chain id as it occurs in the PDB file. For example, the carbonyl oxygen (O) in residue `10A' in chain `A' is specified by `O:10A:A'; if the chain has no chain id, the name would be only 'O:10A'. 2.5. CALCULATION OF SPATIAL RESTRAINTS 73
| # | Form | Parameters |
|---|------|------------|
| 1 | left Gaussian (harmonic lower bound) | $\bar{f}, \sigma$ |
| 2 | right Gaussian (harmonic upper bound) | $\bar{f}, \sigma$ |
| 3 | single Gaussian (harmonic potential) | $\bar{f}, \sigma$ |
| 4 | multiple Gaussian | $(\omega_i)_n, (\bar{f}_i)_n, (\sigma_i)_n$ |
| 5 | Lennard-Jones potential | $A, B$ |
| 6 | Coulomb point-to-point potential | $q_1, q_2$ |
| 7 | Cosine potential | $a, b$ |
| 8 | undefined | |
| 9 | multiple binormal | $(\omega_i)_n, (\bar{f}_{1i}, \bar{f}_{2i})_n, (\sigma_{1i}, \sigma_{2i}$ […] |
| 10 | cubic spline | $p_i$, for $i = 1, \ldots, 6 + 2n$ |
Table 2.2: List of mathematical forms of restraints. The parameters and their order in t[…] $(\ldots)$ is repeated $n$ times, where $n$ is specified by the second integer parameter of the r[…] the cosine restraint, corresponding to parameter $n$ in Eq 6.56, and the number of interpo[…] generally be either a distance, an angle, or a dihedral angle, with the exception of res[…] unit in the restraints file is radians. The internal angle unit of Modeller is ra[…] in PICK_HOT_ATOMS. For cubic splines, $f_{min}$ is the feature value that results in the sma[…] of the Gaussian function fitted locally around $f_{min}$. The parameters $p_i$ for a spline re[…] interpolation is done, $x_1$ ($p_2$), the largest interpolating value $x_n$ ($p_3$), the interval […] the first derivative at $x_n$ ($p_6$). The following $n$ values are the values of the restrain[…] derivatives of the restraint at the interpolating $x_i$ points. ([…] marks text that was cut off at the right margin of the source.)
74 CHAPTER 2. MODELLER COMMANDS
| Index | Feature |
|-------|---------|
| 1 | distance |
| 2 | angle |
| 3 | dihedral angle |
| 4 | a pair of dihedral angles (points 1-4 and 5-8) |
| 5 | distance between gravity centers of two groups of atoms |
| 6 | minimal distance between several pairs of atoms |
Table 2.3: List of feature types that can be restrained.
| Index | Group |
|-------|-------|
| 1 | Bond length potential |
| 2 | Bond angle potential |
| 3 | Stereochemical cosine dihedral potential |
| 4 | Stereochemical improper dihedral potential |
| 5 | soft-sphere overlap restraints |
| 6 | Lennard-Jones 6-12 potential |
| 7 | Coulomb point-point electrostatic potential |
| 8 | H-bonding potential |
| 9 | Distance restraints 1 (Cα-Cα) |
| 10 | Distance restraints 2 (N-O) |
| 11 | Mainchain Φ dihedral restraints |
| 12 | Mainchain Ψ dihedral restraints |
| 13 | Mainchain ω dihedral restraints |
| 14 | Sidechain χ1 dihedral restraints |
| 15 | Sidechain χ2 dihedral restraints |
| 16 | Sidechain χ3 dihedral restraints |
| 17 | Sidechain χ4 dihedral restraints |
| 18 | Disulfide distance restraints |
| 19 | Disulfide angle restraints |
| 20 | Disulfide dihedral angle restraints |
| 21 | X lower bound distance restraints |
| 22 | X upper bound distance restraints |
| 23 | Distance restraints 3 (SDCH-MNCH) |
| 24 | Sidechain χ5 dihedral restraints |
| 25 | (Φ, Ψ) binormal dihedral restraints |
| 26 | Distance restraints 4 (SDCH-SDCH) |
| 27 | Distance restraints 5 (X-Y) |
| 28 | NMR distance restraints 6 (X-Y) |
| 29 | NMR distance restraints 7 (X-Y) |
| 30 | Minimal distance restraints |
| 31 | Non-bonded spline restraints |
Table 2.4: List of `physical' restraint types.
2.5. CALCULATION OF SPATIAL RESTRAINTS 75 2.5.2 MAKE_RESTRAINTS — make restraints Options: RESTRAINT_TYPE = 'STEREO' restraint type to be calculated: 'STEREO' | 'BOND' | 'ANGLE' | 'IMPROPER' | 'DIHEDRAL' | 'MRFP_STEREO' | 'MRFP_BOND' | 'MRFP_ANGLE' | 'MRFP_DIHEDRAL' | 'SPHERE' | 'SPHERE14' | 'LJ' | 'LJ14' | 'COULOMB' | 'COULOMB14' | 'ALPHA' | 'STRAND' | 'SHEET' | 'DISTANCE' | 'USER_DISTANCE' | 'PHI-PSI_BINORMAL' | 'PHI-PSI_CLASS' | 'PHI_DIHEDRAL' | 'PSI_DIHEDRAL' | 'OMEGA_DIHEDRAL' | 'CHI1_DIHEDRAL' | 'CHI2_DIHEDRAL' | 'CHI3_DIHEDRAL' | 'CHI4_DIHEDRAL' RADII_FACTOR = 0.82 factor for van der Waals radii TOPOLOGY_MODEL = 3 selects topology library: 1-9 INTERSEGMENT = on whether to restrain inter-segment non-bonded pairs ADD_RESTRAINTS = off whether to add new restraints to existing restraints RESIDUE_GROUPING = 1 MAXIMAL_DISTANCE = 999.
maximal distance for distance restraints RESIDUE_SPAN_RANGE = 0 99999 range of residues spanning the allowed distances; for MAKE_RESTRAINTS, PICK_RESTRAINTS, non-bonded dynamic pairs ACCESSIBILITY_TYPE = 8 type of solvent accessibility: 1-10 DISTANCE_RSR_MODEL = 1 the model for calculating distance restraints: 1-7 RESTRAINT_STDEV = 0.0 1.0 transforming factors for standard deviations (y=a+bx) in models 1-6 or standard deviation for model 7 (a) RESTRAINT_PARAMETERS = 3 1 3 3 4 2 0 0.0 0.087 restraint parameters for 'USER_DISTANCE' MDT_LIB_FILE = 'mnch1.mdt' file with probability distributions for restraints BIN_LIB_FILE = '$(LIB)/mdt.bin' file with bin definitions for restraints ATOM_FILES_DIRECTORY = './' input atom files directory list (e.g., 'dir1:dir2:dir3:./:/') BASIS_PDF_WEIGHT = 'LOCAL' a method for calculation of basis pdf weights: 'LOCAL' | 'GLOBAL' BASIS_RELATIVE_WEIGHT = 0.05 the cutoff weight of basis pdf's for their removal RESIDUE_IDS = '' residue id (number:chnid) SPLINE_ON_SITE = off whether to convert restraints to splines Requirements: topology & parameters [& alignment] [& picked atoms sets 2 and 3] Description: This command calculates and selects new restraints of a specified type. See the original papers for the most detailed definition and description of the restraints [Sali & Blundell, 1993, Sali & Overington, 1994]. If ADD_RESTRAINTS is off, all old restraints are deleted, otherwise new restraints are added to the old ones. RESTRAINT_TYPE selects the types of the generated restraints. Only one restraint type can be selected at a time, except for the stereochemical restraints (BOND, ANGLE, DIHEDRAL, IMPROPER) that can all be calculated at the same time. It is useful to distinguish between the stereochemical restraints and homology-derived restraints. The stereochemical restraints are obtained from the Charmm 22 force field [Brooks et al., 76 CHAPTER 2. MODELLER COMMANDS 1983] and do not require an alignment with template structures.
In contrast, the homology-derived restraints are calculated from related protein structures (with the exception of some distance restraints). They are obtained for the last sequence in the alignment, using 3D structures of all the other aligned proteins. These templates must have accessible coordinate files, which are the only data files required. o o Stereochemical restraints: o 'BOND'. This calculates covalent bond restraints (harmonic terms). It relies on the list of the atom-atom bonds for MODEL, prepared previously by the GENERATE_TOPOLOGY command. The mean values and force constants are obtained from the parameter library in memory. o 'ANGLE'. This calculates covalent angle restraints (harmonic terms). It relies on the list of the atom-atom-atom angles for MODEL, prepared previously by the GENERATE_TOPOLOGY command. The mean values and force constants are obtained from the parameter library in memory. o 'DIHEDRAL'. This calculates covalent dihedral angle restraints (cosine terms). It relies on the list of the atom-atom-atom-atom dihedral angles for MODEL, prepared previously by the GENERATE_TOPOLOGY command. The minima, phases, and force constants are obtained from the parameter library in memory. o 'IMPROPER'. This calculates improper dihedral angle restraints (harmonic terms). It relies on the list of the improper dihedral angles for MODEL, prepared previously by the GENERATE_TOPOLOGY command. The mean values and force constants are obtained from the parameter library in memory. o 'STEREO'. This implies all 'BOND', 'ANGLE', 'DIHEDRAL', and 'IMPROPER' restraints. o 'MRFP_BOND'. Similar to 'BOND' except that spline restraints from the corresponding MRFP entries in the parameter library are used instead of the harmonic terms. o 'MRFP_ANGLE'. Similar to 'ANGLE' except that spline restraints from the corresponding MRFP entries in the parameter library are used instead of the harmonic terms. o 'MRFP_DIHEDRAL'.
Similar to 'DIHEDRAL' except that spline restraints from the corresponding MRFP entries in the parameter library are used instead of the cosine terms. o 'MRFP_STEREO'. This implies all 'MRFP_BOND', 'MRFP_ANGLE', and 'MRFP_DIHEDRAL' restraints. o 'SPHERE14'. This constructs soft-sphere overlap restraints (lower harmonic bounds) for atom pairs separated by exactly three bonds (1-4 pairs). It relies on atom radii from the '$RADII14_LIB' library. o 'LJ14'. This constructs 1-4 Lennard-Jones restraints using the modified 1-4 Lennard-Jones parameters from the Charmm parameter library. There is no way to calculate 'LJ14' as dynamic restraints. o 'COULOMB14'. This constructs 1-4 Coulomb restraints by relying on the atomic charges from the Charmm topology library. There is no way to calculate 'COULOMB14' as dynamic restraints. o 'SPHERE'. This constructs soft-sphere overlap restraints (lower harmonic bounds) for all atom pairs that are not in bonds, angles, dihedral angles, improper dihedral angles, nor are explicitly excluded by the 'E' entries read from a restraint file or added by the ADD_RESTRAINT command. Note that this makes these restraints static (i.e., not dynamic) and that you must set DYNAMIC_SPHERE to off before evaluating the molecular pdf if you want to avoid duplicated restraints. These restraints should usually not be combined with the Lennard-Jones ('LJ') restraints. o 'LJ'. This constructs Lennard-Jones restraints for all atom pairs that are not in bonds, angles, dihedral angles, improper dihedral angles, nor are explicitly excluded by the 'E' entries read from a restraint file or added by the ADD_RESTRAINT command. Note that this makes these restraints static (i.e., not dynamic) and that you must set DYNAMIC_LENNARD to off before evaluating the molecular pdf if you want to avoid duplicated restraints. Note that Charmm uses both 'LJ14' and 'LJ'. 
For large molecules, it is better to calculate 'LJ' as dynamic restraints because you can use distance cutoff CONTACT_SHELL in OPTIMIZE to reduce significantly the number of non-bonded atom pairs. o 'COULOMB'. This constructs Coulomb restraints for all atom pairs that are not in bonds, angles, dihedral angles, improper dihedral angles, nor are explicitly excluded by the 'E' entries read from a restraint file or added by the ADD_RESTRAINT command. Note that this makes these restraints static (i.e., not dynamic) and that you must set DYNAMIC_COULOMB to off before evaluating the molecular pdf if you want to avoid duplicated restraints. Note that Charmm uses both 'COULOMB14' and 'COULOMB'. For large molecules, it is better to calculate 'COULOMB' as dynamic restraints because you can use distance cutoff CONTACT_SHELL in OPTIMIZE to reduce significantly the number of non-bonded atom pairs. 2.5. CALCULATION OF SPATIAL RESTRAINTS 77 o 'ALPHA'. This makes restraints enforcing an α-helix (mainchain conformation class `A') for the residue segment specified by the two RESIDUE_IDS (Section 2.4.1). The helix is restrained by Φ, Ψ binormal restraints, N-O hydrogen bonds, Cα-Cα distances for $i - j \in \{2, \ldots, 9\}$, Cα-O distances for $i - j \in \{2, \ldots, 9\}$, and O-O distances for $i - j \in \{2, \ldots, 6\}$. These target distances were all obtained from a regular α-helix in one of the high-resolution myoglobin structures. A convenient way to add 'ALPHA', 'STRAND', or 'SHEET' restraints to the calculation by the 'model' script is to include them in the special_restraints routine (Chapter 4, Question 20). Note that at least the non-hydrogen mainchain atoms topology model is required although the same functionality could also be provided for the Cα-only topology with small changes to the source code. o 'STRAND'. This makes restraints enforcing an extended strand conformation for the residue segment specified by the two RESIDUE_IDS (Section 2.4.1). This is achieved by applying Φ, Ψ binormal restraints only.
These binormal restraints force the mainchain conformation into class `B', except for the Pro residues which are restrained to class `P' [Sali & Blundell, 1993]. o 'SHEET'. This calculates H-bonding restraints for a pair of β-strands. ATOM_IDS specifies the two atom identifiers (Section 2.5.1) defining the first H-bond in the β-sheet ladder. SHEET_H-BONDS specifies the number of H-bonds to be added. The parallel and anti-parallel sheets are selected by a positive and negative integer in SHEET_H-BONDS, respectively. In a parallel sheet, hydrogen bonds start at the first or the second term of the following series (depending on ATOM_IDS): 1N:1O, 1O:3N, 3N:3O, 3O:5N, etc. For an anti-parallel sheet, the corresponding series is 1N:3O, 1O:3N, 3N:1O, 3O:1N, etc.; note that the residue indices are always decreasing for the second strand. The extended structure of the individual strands must be enforced separately by the 'STRAND' restraints if so desired. o o Homology-derived restraints: o 'DISTANCE'. This makes distance restraints that are generated for all pairs of atoms i, j where atom i is from selected set 2 and atom j is from selected set 3 (as defined by the PICK_ATOMS command). The atoms also have to be within the residue spanning range specified by RESIDUE_SPAN_RANGE = r1 r2, such that the residue index difference satisfies $r_1 \le |ir_2 - ir_1| \le r_2$ when RESIDUE_SPAN_SIGN = off and $r_1 \le (ir_2 - ir_1) \le r_2$ when RESIDUE_SPAN_SIGN = on. Moreover, for a restraint to be created, at least one distance in the template structures must be less than MAXIMAL_DISTANCE (in Å). RESTRAINT_STDEV = a b specifies the linear transformation for the first six standard deviation models ($\sigma' = a + b \cdot \sigma$). These six models are polynomials and depend on several structural features of the template and its similarity to the target. When the seventh model is selected, its standard deviation is a constant equal to a.
Each basis pdf in the distance pdf corresponds to one template structure with an equivalent distance. The mean of this basis pdf is equal to the template distance and its standard deviation is calculated from an analytic model specified by DISTANCE_RSR_MODEL. Use model 5 for Cα-Cα distances and model 6 for N-O distances. The weights of basis pdf's depend on local sequence similarity between the target and the templates when BASIS_PDF_WEIGHT = 'LOCAL' and on global sequence identity when BASIS_PDF_WEIGHT = 'GLOBAL'. o 'USER_DISTANCE'. This makes distance restraints between pairs of atoms from set 2 and 3 (inter-set only), using the value of RESTRAINT_PARAMETERS. Only distances satisfying the RESIDUE_SPAN_RANGE criterion are restrained. This command is useful for making non-specific `compactization' restraints. o 'PHI-PSI_CLASS', 'CHI1_DIHEDRAL', 'CHI2_DIHEDRAL', 'CHI3_DIHEDRAL', 'CHI4_DIHEDRAL', 'PHI_DIHEDRAL', 'PSI_DIHEDRAL', 'OMEGA_DIHEDRAL', 'PHI-PSI_BINORMAL' are the mainchain and sidechain dihedral angle restraints. The means and standard deviations for the dihedral Gaussian restraints are obtained from the $RESDIH_LIB and $MNCH_LIB libraries and their weights from the MDT tables, specified by MDT_LIB_FILE and BIN_LIB_FILE. The large MDT tables give the conditional weights for each possible dihedral angle class, as a function of all possible combinations of features on which a particular class depends. If there is no equivalent residue in any of the templates, the weights for the dihedral angle classes depend only on the residue type and are obtained from the '$RESDIH_LIB' and '$MNCH_LIB' libraries. BASIS_PDF_WEIGHT has the same effect as for the distance pdf's. MDT_LIB_FILE and BIN_LIB_FILE have to be specified for all homology-derived restraints that depend on the MDT files, including all mainchain and sidechain dihedral angle restraints.
BASIS_RELATIVE_WEIGHT is the cutoff for removing weak basis pdf's from poly-Gaussian feature pdf's: a basis pdf whose weight is less than the BASIS_RELATIVE_WEIGHT fraction of the largest weight is deleted. 78 CHAPTER 2. MODELLER COMMANDS Example: # Example for: MAKE_RESTRAINTS, SPLINE_RESTRAINTS, WRITE_RESTRAINTS # This will compare energies of bond length restraints expressed # by harmonic potential and by cubic spline. READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' READ_MODEL FILE = '1fas', MODEL_SEGMENT = '1:' '61:' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off WRITE_MODEL FILE '1fas.ini' MAKE_RESTRAINTS RESTRAINT_TYPE = 'bond' WRITE_RESTRAINTS FILE = '1fas-1.rsr' ENERGY DYNAMIC_SPHERE = off SPLINE_RESTRAINTS SPLINE_RANGE = 5.0, SPLINE_DX = 0.005, SPLINE_SELECT = 3 1 1 CONDENSE_RESTRAINTS WRITE_RESTRAINTS FILE = '1fas-2.rsr' ENERGY 2.5.3 DEFINE_SYMMETRY — define similar segments Options: SYMMETRY_WEIGHT = 1.0 the weight of the symmetry objective function term ADD_SYMMETRY = off on whether to add segment pair, add atoms to segment pair Description: This command allows defining pairs of segments that will be restrained to be the same during optimization of the objective function. This is achieved by adding the sum of squares of the differences between the equivalent distances (similar to distance Rms deviation) to the objective function being optimized, separately for each pair of segments defined by DEFINE_SYMMETRY. The value of this term is reported in the log file by the ENERGY command, which also reports the individual contributions to the term when OUTPUT contains the word 'SYMMETRY'.
In each call of the DEFINE_SYMMETRY command, the list of such segments is either initiated, extended by a new pair of segments, or the last defined pair of segments is extended by adding new atoms. SYMMETRY_WEIGHT specifies the atomic weights to be used in the calculation of the symmetry term (Eq. 6.71). The two segments correspond to the selected sets 2 and 3 (obtained by the PICK_ATOMS command). They must have the same number of atoms. A pair of segments can be either added to the list (ADD_SYMMETRY[1] = on) or the list can be initialized (ADD_SYMMETRY[1] = off). If ADD_SYMMETRY[2] = on, the currently selected atoms are added to the last segment pair in the segment pairs list, otherwise a new segment pair is started. 2.5. CALCULATION OF SPATIAL RESTRAINTS 79 Example: # Example for: DEFINE_SYMMETRY # This will force two copies of 1fas to have similar mainchain # conformation. DEFINE_STRING VARIABLES = SEG1 SEG2 READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Generate two copies of a segment: READ_MODEL FILE = '2abx', MODEL_SEGMENT = '1:A' '74:B' SEQUENCE_TO_ALI ATOM_FILES = '2abx', ALIGN_CODES = '2abx' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '2abx.ini', ; ALIGN_CODES = ALIGN_CODES '2abx-ini' GENERATE_TOPOLOGY SEQUENCE = '2abx-ini' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off RENAME_SEGMENTS SEGMENT_IDS = 'A' 'B' RANDOMIZE_XYZ DEVIATION = 6.0 # Define the two segments (chains in this case) to be identical: CALL ROUTINE = 'defsym', SEG1 = '1:A' '74:A', SEG2 = '1:B' '74:B' # Make them identical by optimizing the initial randomized structure # without any other restraints: SET DYNAMIC_SPHERE = off ENERGY WRITE_MODEL FILE = 'define_symmetry-1.atm' OPTIMIZE MAX_ITERATIONS = 300 WRITE_MODEL FILE = 'define_symmetry-2.atm' ENERGY # Now optimize with stereochemical restraints so that the # result is not so distorted a structure (still distorted # because optimization is not thorough): SET DYNAMIC_SPHERE = on
MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' RANDOMIZE_XYZ DEVIATION = 3.0 SET MAX_ITERATIONS = 300, MD_RETURN = 'FINAL' OPTIMIZE OPTIMIZATION_METHOD = 1 # Conjugate gradients OPTIMIZE OPTIMIZATION_METHOD = 3 # Molecular dynamics OPTIMIZE OPTIMIZATION_METHOD = 1 # Conjugate gradients WRITE_MODEL FILE = 'define_symmetry-3.atm' ENERGY DELETE_ALIGNMENT READ_MODEL MODEL_SEGMENT = '1:A' '74:A' READ_MODEL2 MODEL2_SEGMENT = '1:B' '74:B' PICK_ATOMS ATOM_TYPES = 'MNCH' SUPERPOSE STOP SUBROUTINE ROUTINE = 'defsym' 80 CHAPTER 2. MODELLER COMMANDS SET ATOM_TYPES = 'MNCH' SET SELECTION_STATUS = 'INITIALIZE' SET SELECTION_SEARCH = 'SEGMENT' SET SYMMETRY_WEIGHT = 1.0 PICK_ATOMS PICK_ATOMS_SET = 2, SELECTION_SEGMENT = SEG1 PICK_ATOMS PICK_ATOMS_SET = 3, SELECTION_SEGMENT = SEG2 DEFINE_SYMMETRY ADD_SYMMETRY = on off RETURN END_SUBROUTINE 2.5.4 PICK_RESTRAINTS — pick restraints for selected atoms Options: SCHEDULE_STEP = 1 schedule step for optimization N_SCHEDULE = 1 the number of steps in the optimization schedule RESTRAINTS_FILTER = -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 keep restraints? KEEP_RESTRAINTS = 'ONE_ATOM' what static restraints to keep: 'ALL_ATOMS' | 'ONE_ATOM' ADD_RESTRAINTS = off whether to add new restraints to existing restraints Description: This command selects some or all of the restraints currently in memory. If ADD_RESTRAINTS is on, the already selected restraints remain selected; additional restraints also become selected if they satisfy currently specified conditions. If ADD_RESTRAINTS is off, only those restraints that satisfy currently specified conditions become selected. SCHEDULE_STEP specifies which of the current N_SCHEDULE steps in the variable target function schedule is going to be used for the selection of restraints. The pre-defined N_SCHEDULE variable is used to check that SCHEDULE_STEP satisfies 1 ≤ SCHEDULE_STEP ≤ N_SCHEDULE.
Non-default schedule steps can be calculated by MAKE_SCHEDULE or read from a file by READ_SCHEDULE. Each schedule step specifies the restraint scaling factors, the optimization method (the equivalent of OPTIMIZATION_METHOD, Section 2.6.6), and the second integer of the residue span range (the equivalent of RESIDUE_SPAN_RANGE[2], Section 2.5.2). The first residue span integer is assumed to be 0 and the equivalent of the RESIDUE_SPAN_SIGN is assumed to be off (Section 2.5.2). If MAKE_SCHEDULE or READ_SCHEDULE have not been executed, there is a default schedule step with all scaling factors equal to 1 and a very large residue span range. A static restraint is selected if at least one (KEEP_RESTRAINTS = 'ONE_ATOM'; default) or all (KEEP_RESTRAINTS = 'ALL_ATOMS') of its atoms are selected (set 1), if it is strong enough based on its standard deviations or force constants (see the next paragraph), and if it does not span more than the maximal allowed number of residues (both specified in the current schedule step as described above). To decide if a restraint is strong enough, standard deviations or force constants are compared with the corresponding RESTRAINTS_FILTER[physical_restraint_type]. A harmonic restraint, lower and upper bounds, and multi-modal Gaussian restraints are selected if the (smallest) standard deviation is less than the corresponding RESTRAINTS_FILTER[i]. The cosine energy term is selected if its force constant is larger than 2.5. CALCULATION OF SPATIAL RESTRAINTS 81 the corresponding RESTRAINTS_FILTER[i]. If RESTRAINTS_FILTER[i] = -999, a restraint of type i is always selected. Restraints of the other physical_restraint_types are always selected (Coulomb, Lennard-Jones, binormal, and spline). The RESTRAINTS_FILTER angles have to be specified in radians. Example: # Example for: PICK_RESTRAINTS, CONDENSE_RESTRAINTS # This will pick only restraints that include at least one # CA atom and write them to a file.
READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' ENERGY PICK_ATOMS ATOM_TYPES = 'CA' PICK_RESTRAINTS ADD_RESTRAINTS = off, KEEP_RESTRAINTS = 'ONE_ATOM' # Delete the unselected restraints from memory: CONDENSE_RESTRAINTS ENERGY WRITE_RESTRAINTS FILE = '1fas.rsr' 2.5.5 CONDENSE__RESTRAINTS _ remove unselected restraints Description: This command removes all the unselected restraints from memory. Example: See READ_MODEL command. 2.5.6 ADD__RESTRAINT _ add restraint Options: ATOM_IDS = '' atom ids: atom:residue_id[:chain_id] RESTRAINT_PARAMETERS = 3 1 3 3 4 2 0 0.0 restraint parameters 0.087 Description: This command adds a specified restraint to the end of the restraints list and selects it. This command is useful for specifying cis-peptide bonds from a Top script. The angles have to be in radians. Example: # Example for: ADD_RESTRAINT, DELETE_RESTRAINT # This will enforce cis conformation for Pro-56. 82 CHAPTER 2. 
MODELLER COMMANDS # Make a model and stereochemical restraints: DEFINE_STRING VARIABLES = ATOM_IDS1 ATOM_IDS2 READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' # Change the Pro-56 restraint from trans to cis: CALL ROUTINE = 'cispeptide', ATOM_IDS1 = 'O:56' 'C:56' 'N:57' 'CA:57', ; ATOM_IDS2 = 'CA:56' 'C:56' 'N:57' 'CA:57' WRITE_RESTRAINTS FILE = '1fas.rsr' ENERGY SUBROUTINE ROUTINE = 'cispeptide' # Delete the old restraint on the same atoms: DELETE_RESTRAINT ATOM_IDS = ATOM_IDS1 # Add the new restraint: ADD_RESTRAINT RESTRAINT_PARAMETERS = 3 1 3 3 4 2 0 3.141593 0.087 DELETE_RESTRAINT ATOM_IDS = ATOM_IDS2 ADD_RESTRAINT RESTRAINT_PARAMETERS = 3 1 3 3 4 2 0 0.0 0.087 RETURN END_SUBROUTINE 2.5.7 DELETE__RESTRAINT _ unselect restraint Options: ATOM_IDS = '' atom ids: atom:residue_id[:chain_id] Requirements: MODEL Description: This command scans the currently selected restraints to find all the restraints that operate on the specified atoms (Section 2.5.1) and then unselects them. The order of the atom names in ATOM_IDS does not matter: All restraints that contain all and only the specified atoms are unselected. This means that it is not possible to distinguish between the dihedral angle and improper dihedral angle restraints on the same four atoms. The command only unselects the restraints found. To completely remove all the unselected restraints from memory, use CONDENSE_RESTRAINTS. The DELETE_RESTRAINT command is useful in speci- fying cis-peptide bonds from a Top script. Example: See ADD_RESTRAINT command. 2.5. 
CALCULATION OF SPATIAL RESTRAINTS 83 2.5.8 REINDEX__RESTRAINTS _ renumber MODEL2 restraints for MODEL Requirements: restraints & MODEL & MODEL2 Description: This command renumbers atom indices in all restraints in memory. It is expected that the input re- straints refer to MODEL2; the re-indexed restraints will correspond to MODEL. Both MODEL and MODEL2 have to be in memory. Only those restraints that have all atoms in MODEL will be selected. You can remove the others by CONDENSE_RESTRAINTS. This command is useful when the old restraints have to be used while changing from one topology model to another. Example: # Example for: REINDEX_RESTRAINTS # This will reindex restraints obtained previously for a simpler topology so # that they will now apply to a more complicated topology. # Generate the model for the simpler topology (CA only in this case): READ_TOPOLOGY FILE = '$(LIB)/top_ca.lib' READ_PARAMETERS FILE = '$(LIB)/par_ca.lib' SET TOPOLOGY_MODEL = 7 READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ca', ; ALIGN_CODES = ALIGN_CODES '1fas-ca' GENERATE_TOPOLOGY SEQUENCE = '1fas-ca' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off WRITE_MODEL FILE = '1fas.ca' # Generate the restraints for the simpler topology: MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' WRITE_RESTRAINTS FILE = '1fas-ca.rsr' ENERGY # Generate the model for the more complicated topology: READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' SET TOPOLOGY_MODEL = 3 READ_MODEL FILE = '1fas' SET ADD_SEQUENCE = off SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ WRITE_MODEL FILE = '1fas.ini' READ_MODEL2 FILE = '1fas.ca' REINDEX_RESTRAINTS WRITE_RESTRAINTS FILE = '1fas.rsr' ENERGY 84 CHAPTER 2. 
MODELLER COMMANDS 2.5.9 SPLINE__RESTRAINTS _ approximate restraints by splines Options: SPLINE_DX = 0.5 interval size for splining restraints SPLINE_MIN_POINTS = 5 have at least as many intervals in a spline SPLINE_RANGE = 4.0 range of the splines SPLINE_SELECT = 4 1 9 specification of the restraints to be splined: form feature group Description: This command calculates and selects new restraints that are a spline approximation of the selected restraints of the specified type. It unselects the approximated restraints. The type of the approximated restraints is specified by SPLINE_SELECT and is defined by the mathematical form (Gaussian, etc.), feature type (distance, etc.), and physical restraint group (sidechain $\chi_1$, etc.) (the first, third, and fourth integer numbers in the restraint specification). The restraint is approximated in a certain range only, determined differently for different mathematical forms. For example, the poly-Gaussian range is from $m - \mathrm{SPLINE\_RANGE} \times \sigma_m$ to $M + \mathrm{SPLINE\_RANGE} \times \sigma_M$, where $m$ and $M$ are the minimal and maximal means of the basis pdfs, and $\sigma_m$ and $\sigma_M$ are their corresponding standard deviations. The spline points are distributed evenly over this range with an interval of SPLINE_DX. SPLINE_DX should be equal to the scale of the peaks of the restraint that you want to approximate reliably. The value of the restraint beyond the range is determined by linear extrapolation using the first derivatives at the bounds. If the x-range and SPLINE_DX are such that the number of spline points would be less than SPLINE_MIN_POINTS, SPLINE_DX is decreased so that there are SPLINE_MIN_POINTS defining the `splined' restraint. Example: See MAKE_RESTRAINTS command. 
2.5.10 READ__RESTRAINTS _ read spatial restraints Options: FILE = 'default' input restraints file DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') FILE_FORMAT = 'FORMATTED' file format: 'FORMATTED' _ 'UNFORMATTED' ADD_RESTRAINTS = off whether to add new restraints to existing restraints Description: This command reads restraints, excluded atom pairs, and pseudo atom definitions from a file. An excluded atom pair specifies two atoms that are not to be tested during generation of the dynamic non-bonded pair list. There is one restraint entry per line. The two possible formats of the file, MODELLER and USER, are described in Section 2.5. The routine determines automatically which format is used. The new restraints are added to those that are already in memory if ADD_RESTRAINTS = on; otherwise they start a new restraints list, replacing any restraints already in memory. All the new restraints are automatically selected. Example: See MAKE_RESTRAINTS command. 2.5.11 WRITE__RESTRAINTS _ write spatial restraints Options: FILE = 'default' partial or complete filename OUTPUT_DIRECTORY = '' output directory FILE_FORMAT = 'FORMATTED' file format: 'FORMATTED' _ 'UNFORMATTED' 2.5. CALCULATION OF SPATIAL RESTRAINTS 85 Description: This command writes the currently selected restraints to a file in the MODELLER format (see Section 2.5). If 'UNFORMATTED' format is selected, the file is approximately one third of the 'FORMATTED' size. Both formats can be read with the READ_RESTRAINTS command. The output in the USER format is not yet implemented. Example: See MAKE_RESTRAINTS command. 86 CHAPTER 2. MODELLER COMMANDS 2.6 Optimization of the model This section describes commands for creating, reading and writing an optimization schedule, and for calculating and optimizing the objective function. For technical background, see Section 6.2. 
2.6.1 MAKE__SCHEDULE _ create optimization schedule Options: LIBRARY_SCHEDULE = 1 selects schedule from the $SCHED_LIB library SCHEDULE_SCALE = 1 1 1 1 1 1 1 1 1 1factors for physical restraint types in scaling t* *he 1 1 1 1 1 1 1 1 1 1schedule 1 1 1 1 1 1 1 1 1 1 1 Requirements: MODEL Output: N_SCHEDULE Description: This command constructs an optimization schedule for the variable target function method for the current MODEL. The template for construction of the schedule is the LIBRARY_SCHEDULE-th entry in library file $SCHED_LIB. The usual schedule for the variable target function part of optimization in comparative modeling is as follows. The residue range (PICK_RESTRAINTS and Section 2.5.2) is increased with increasingly larger steps until the protein length is reached. The scaling of homology-derived and bonded stereochemical restraints increases from a small value to 1 in the initial few steps to allow for imperfect starting geometries, especially those that result from RANDOMIZE_XYZ and long insertions or deletions. The soft-sphere overlap restraints are slowly introduced only in the last four steps of the variable target function method to save CPU time and increase the radius of convergence. In comparative modeling by the 'model' script in the default mode, the variable target function method is usually followed by simulated annealing with molecular dynamics. In this last stage, all homology-derived and stereochemical restraints are generally used with the scaling factors of 1. There are a number of variables defined in the 'modlib/__defs.top' script that can be used to influence the thoroughness of both the variable target function and molecular dynamics parts of the optimization (Chapter 3). The scaling factors for all physical restraint groups, in all schedule steps, are multiplied by the corresponding scalar in SCHEDULE_SCALE (1 by default). 
This is useful when template-derived fold restraints have to be weakened relative to some external restraints, so that the fold can actually reflect these external restraints, even when they are quite different from the template-derived restraints. This command is an alternative to the READ_SCHEDULE command. Use the WRITE_SCHEDULE command to find out what the calculated schedule is. The schedule file written by the `model' routine has an extension .sch. Example: # Example for: MAKE_SCHEDULE, WRITE_SCHEDULE, READ_SCHEDULE # This will create a VTFM optimization schedule for a model # and write it to a file. # MODEL has to be in memory for MAKE_SCHEDULE: READ_MODEL FILE = '1fas' MAKE_SCHEDULE LIBRARY_SCHEDULE = 1 # Write the schedule to a file: WRITE_SCHEDULE FILE = '1fas.sch' # Read it in just for fun: READ_SCHEDULE FILE = '1fas.sch' 2.6. OPTIMIZATION OF THE MODEL 87 2.6.2 READ__SCHEDULE _ read optimization schedule Options: FILE = 'default' partial or complete filename DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') SCHEDULE_SCALE = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 factors for physical restraint types in scaling the schedule Output: N_SCHEDULE Description: This command reads a text file that contains an optimization schedule for the variable target function method. Each line in the file contains in free format the parameters for a single step of the variable target function method. These parameters are: step index (not used by the program), optimization method, maximal difference in residue indices of atoms restrained by the selected restraints (PICK_RESTRAINTS and Section 2.5.2), and the scaling factors for all types of restraints. The smaller the scaling factor, the weaker the corresponding restraint. See MAKE_SCHEDULE for explanation of SCHEDULE_SCALE. This command also sets the Top variable N_SCHEDULE to the total number of the variable target function steps that were read in. Example: See MAKE_SCHEDULE command. 
2.6.3 WRITE__SCHEDULE _ write optimization schedule Options: FILE = 'default' partial or complete filename OUTPUT_DIRECTORY = '' output directory Description: This command writes out the schedule for the variable target function method. This schedule file can then be read by the READ_SCHEDULE command. Example: See MAKE_SCHEDULE command. 88 CHAPTER 2. MODELLER COMMANDS 2.6.4 ENERGY _ evaluate MODEL given restraints Options: VIOL_REPORT_CUT = 4.5 4.5 4.5 4.5 4.5 cutoffs for reporting relative violations 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 999 999 999 999 4.5 4.5 4.5 4.5 4.5 4.5 999 4.5 4.5 4.5 4.5 4.5 4.5 4.5 VIOL_REPORT_CUT2 = 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 OUTPUT = 'LONG' 'SHORT' _ 'LONG' _ 'VERY_LONG' _ 'GRADIENT' _ 'SYMMETRY' ASGL_OUTPUT = off whether to write output for ASGL SCHEDULE_STEP = 1 schedule step for optimization TOPOLOGY_MODEL = 3 selects topology library: 1-9 RADII_FACTOR = 0.82 factor for van der Waals radii SPHERE_STDV = 0.05 standard deviation of soft-sphere repulsion DYNAMIC_SPHERE = on whether to use dynamic soft-sphere repulsion terms DYNAMIC_LENNARD = off whether to use dynamic Lennard-Jones energy terms DYNAMIC_COULOMB = off whether to use dynamic Coulomb energy terms DYNAMIC_MODELLER = off whether to use dynamic MODELLER non- bonded restraints LENNARD_JONES_SWITCH = 6.5 7.5 the range for Lennard-Jones interaction smooth- ing to 0 COULOMB_SWITCH = 6.5 7.5 the range for Coulomb interaction smoothing to 0 RELATIVE_DIELECTRIC = 1.0 relative dielectric constant CONTACT_SHELL = 4.0 distance cutoff for calculation of the non-bonded pairs list UPDATE_DYNAMIC = 0.39 when to update non-bonded pairs list NLOGN_USE = 15 number of residues at which to begin using the N Log N non-bonded pairs routine Output: MOLPDF Requirements: MODEL & restraints Description: The main purpose of this command is to compare spatial features of the current MODEL with the selected 
restraints in order to determine the violations of the molecular pdf. It lists variable amounts of information about the values of the basis, feature, and molecular pdf's for the current MODEL. All arguments that affect the value of the molecular pdf are also relevant for the ENERGY command. Most of the output goes to the log file. The output of the ENERGY command has to be examined carefully, at least at the end of the optimization, when the final model is produced. Additional output files, for the Asgl plotting program are created if ASGL_OUTPUT = on (undocumented). OUTPUT selects various kinds of output information: 2.6. OPTIMIZATION OF THE MODEL 89 o 'LONG' writes restraint violations one per line to the log file. o 'VERY_LONG' writes the most detailed examination of the selected basis and feature pdf's to the log file, using several lines of output for each restraint. o 'GRADIENT' writes the `force' gradients for the currently selected restraints to the isotropic temperature factors for each atom of the current MODEL. o 'SYMMETRY' writes a comparison of equivalent distances involved in the definition of the symmetry enforcing term to the log file. VIOL_REPORT_CUT is a vector with one real number for each physical restraint type. A restraint is reported when its `heavy relative violation' is larger than the corresponding cutoff. The heavy relative violation is calculated by finding the global minimum of a feature according to the restraint, taking the difference between the actual feature in the model and this global minimum, and then normalizing the difference by the standard deviation of the global minimum. The `minimal violation' of a restraint is defined as the difference from the local minimum closest to the value of the feature in the model (with the exception of the spline restraints; see next paragraph). VIOL_REPORT_CUT2 is similar to VIOL_REPORT_CUT, except that it contains cutoffs for restraint `energies', not heavy relative violations. 
The meaning of various other reported properties of the violated restraints is briefly described in the log file. Note that for multi-modal restraints that are described by cubic splines (by default, all multimodal homology-derived restraints), only one optimal value is defined, not the local and global minimum as for the multi-modal Gaussian restraints. As a result, the minimal violations and heaviest violations are the same. For interpreting the seriousness of violations, use the following rule of thumb: There should be at most a few small violations (e.g., 4 standard deviations) for all monomodal restraints. In comparative modeling, the monomodal restraints include the stereochemical restraints and distance restraints when only one homologous structure is used. For the multimodal restraints, there are usually many violations reported because the heaviest violations are used in deciding whether or not to report a violation. In comparative modeling, the multimodal restraints include the $\chi_i$ restraints, $(\Phi, \Psi)$ binormal restraints and distance restraints when more than one template is used. See also Chapter 4, Question 23. See description of OPTIMIZE for the other variables. Example: # Example for: ENERGY # This will calculate the stereochemical energy (bonds, # angles, dihedrals, impropers) for a given model. READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' # Must patch disulfides here to calculate the non-bonded # energy properly. Also, when you use hydrogens, disulfides # must always be patched so that sulfhydryl hydrogens are # removed from the model. 
PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '17' '39' PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '3' '22' PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '53' '59' PATCH RESIDUE_TYPE = DISU, RESIDUE_IDS = '41' '52' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off 90 CHAPTER 2. MODELLER COMMANDS MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' ENERGY DYNAMIC_SPHERE = on 2.6.5 ENERGY__PROFILE _ calculate energy profile of MODEL Options: SMOOTHING_WINDOW = 1 profiles are smoothed over 2*SW + 1 residues OUTPUT = 'LONG' 'ENERGY_PROFILE' _ 'RES_VIOL' NORMALIZE_PROF = off whether to normalize energy profiles or not all ENERGY options Requirements: MODEL & restraints Description: This command calculates residue energies or heavy relative violations, depending on OUTPUT, for all physical restraint types (there are NPHYCNS of them). Relative heavy violations (Table 2.2) are used because only relative violations of different features are comparable. In both cases, the residue sum is the sum over all restraints that have at least one atom in a given residue. The contribution of each restraint is counted exactly once for each residue, without any weighting. Restraints spanning more than one residue contribute equally to all of them. Thus, the sum of residue energies is generally larger than molecular pdf. The command also calculates the sum of the NPHYCNS contributions for each residue and writes all NPHYCNS+1 columns to a file suitable for plotting by Asgl . If NORMALIZE_PROF is on the energy profile value for each residue is normalized by the number of restraints of a given type that contribute to the `energy' of that residue. All the curves are smoothed by the running window averaging method if SMOOTHING_WINDOW is larger than 0: The window is centered on residue i and extends for SMOOTHING_WINDOW residues on each side. Thus, there are 2 x SMOOTHING_WINDOW + 1 residues taken into account for each i. 
The only exceptions are the two termini, where a smaller number of residues are available for smoothing. The relative weight of residue j when calculating the smoothed value at residue i is (SMOOTHING_WINDOW - |j - i| + 1). The energy or the violations profile is written to the fourth column of the MODEL atomic records (atomic isotropic temperature factors for X-ray structures). Note that all the atoms in one residue get the same number. This output is useful for exploring the violations on a graphics terminal. Example: # Example for: ENERGY_PROFILE # Read libraries for calculating restraints: READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' # Have to skip this for now: # READ_ATOM_CLASSES ATOM_CLASSES_FILE = '$(LIB)/atmcls-168.lib' # READ_PARAMETERS FILE = '$(LIB)/par-sippl1.lib', ADD_PARAMETERS = on READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ALIGN_CODES = '1fas', ATOM_FILES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' # Generate MODEL topology GENERATE_TOPOLOGY 2.6. 
OPTIMIZATION OF THE MODEL 91 # Make sure the coordinates are fine and in the correct order: TRANSFER_XYZ # Get the remaining undefined coordinates from internal coordinates: BUILD_MODEL INITIALIZE_XYZ = off # Calculate CHARMM stereochemical restraints: MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' # Select several non-bonded restraint types: SET DYNAMIC_SPHERE = on SET DYNAMIC_LENNARD = on SET DYNAMIC_COULOMB = on SET DYNAMIC_MODELLER = on SET CONTACT_SHELL = 10.0 # Calculate the smoothed energy profiles: SET SMOOTHING_WINDOW = 3 ENERGY_PROFILE OUTPUT = 'ENERGY_PROFILE', FILE = 'profile.asgl' # Calculate solvent accessibility, useful for interpreting the energy profiles: WRITE_DATA FILE = 'profile', OUTPUT = 'PSA' # You can plot the profiles with ASGL, using script $MODROOT/scripts/profile.top # SYSTEM COMMAND = 'asgl profile' 2.6.6 OPTIMIZE _ optimize MODEL given restraints Options: OPTIMIZATION_METHOD = -999 type of optimization method: 1 _ 3 SCHEDULE_STEP = 1 schedule step for optimization TOPOLOGY_MODEL = 3 selects topology library: 1-9 RADII_FACTOR = 0.82 factor for van der Waals radii SPHERE_STDV = 0.05 standard deviation of soft-sphere repulsion DYNAMIC_SPHERE = on whether to use dynamic soft-sphere repulsion terms DYNAMIC_LENNARD = off whether to use dynamic Lennard-Jones energy terms DYNAMIC_COULOMB = off whether to use dynamic Coulomb energy terms DYNAMIC_MODELLER = off whether to use dynamic MODELLER non- bonded restraints LENNARD_JONES_SWITCH = 6.5 7.5 the range for Lennard-Jones interaction smooth- ing to 0 COULOMB_SWITCH = 6.5 7.5 the range for Coulomb interaction smoothing to 0 RELATIVE_DIELECTRIC = 1.0 relative dielectric constant DYNAMIC_FLAG = 0 CONTACT_SHELL = 4.0 distance cutoff for calculation of the non-bonded pairs list UPDATE_DYNAMIC = 0.39 when to update non-bonded pairs list NLOGN_USE = 15 number of residues at which to begin using the N Log N non-bonded pairs routine 92 CHAPTER 2. 
MODELLER COMMANDS TRACE_OUTPUT = 0 modulus for writing information about optimiza- tion iterations: 0 for nothing MAX_ITERATIONS = 200 maximal iterations in optimization OUTPUT = 'LONG' 'NO_REPORT' o For conjugate gradients: MIN_ATOM_SHIFTS = TYPEVALUES DEFAULT 'NO_REPORT' _ 'REPORT' o For molecular dynamics: MD_TIME_STEP = 4.0 time step for MD in fs INIT_VELOCITIES = on whether to initialize velocities before MD TEMPERATURE = 293.0 temperature for MD simulation in K EQUILIBRATE = 999999 equilibrate during MD every that many steps MD_RETURN = 'FINAL' return MODEL with 'MINIMAL' energy or 'FINAL' MODEL CAP_ATOM_SHIFT = 0.2 limit for atomic shifts in optimization RAND_SEED = -8123 random seed from -50000 to -2 STOP_ON_ERROR = 1 whether to stop on error Output: MOLPDF, MODELLER_STATUS Requirements: MODEL & restraints Description: This command performs a number of optimizing iterations using a selected optimization method (6.2). One call to OPTIMIZE corresponds to a single step of the variable target function method. The whole variable target function method is implemented by a Top script. The molecular pdf is optimized with respect to the selected coordinates of the current MODEL; the optimized coordinates are returned as the current MODEL. Some output may be generated during optimization; for example, a value of the molecular pdf, average and maximal atomic shifts are written to the current tracing file every TRACE_OUTPUT iterations of the optimizer if TRACE_OUTPUT is larger than 0 (see the SWITCH_TRACE command). In addition, a summary of the optimization results is written to the log file after optimization, unless OUTPUT contains string 'NO_REPORT'. OPTIMIZATION_METHOD = 1 selects a conjugate gradients optimization method. OPTIMIZATION_METHOD = 3 selects a molecular dynamics optimization at a fixed temperature. The conjugate gradients optimizer is a modified version of the Beale restart conjugate gradients method [Shanno & Phua, 1980, Shanno & Phua, 1982]. 
The molecular dynamics routine is the most basic version of the iterative solver of Newton's equations of motion. The integrator uses the Verlet algorithm [Verlet, 1967]. All atomic masses are set to that of carbon 12. A brief description of the algorithms is given in Section 6.2. SCHEDULE_STEP is the variable target function step. It selects some of the optimization parameters; it refers to the line in the schedule file which specifies (1) the optimization method (1=Conjugate Gradients, 3=Molecular Dynamics); (2) maximal number of residues that the restraints are allowed to span (Section 2.5.2); (3) the individual scaling factors for all the physical restraint types. OPTIMIZATION_METHOD overrides the schedule specification if it is within a defined range. CONTACT_SHELL defines the maximal distance between atoms that flags a non-bonded atom pair. Such pairs are stored in the list of non-bonded atom pairs. Only those non-bonded pairs that are sufficiently close to each other will result in an actual non-bonded restraint. If undefined (-999), the default value is the maximum of the three possibilities: twice the radius of the largest atom multiplied by RADII_FACTOR (in the case of the all non-hydrogen atoms model, this is 3.2 A); LENNARD_JONES_SWITCH[2]; or COULOMB_SWITCH[2]. Only those values of the three possibilities are compared that have the corresponding DYNAMIC_SPHERE, DYNAMIC_LENNARD, or DYNAMIC_COULOMB set to on. The best value for CONTACT_SHELL must be found in combination with UPDATE_DYNAMIC (see also below). Good values are 4A for CONTACT_SHELL and 0.39A for UPDATE_DYNAMIC when no Lennard-Jones and Coulomb terms are used; if CONTACT_SHELL 2.6. OPTIMIZATION OF THE MODEL 93 is larger, there would be many pairs in the non-bonded pairs list which would slow down the evaluation of the molecular pdf. If it is too small, however, the increased frequency of the pair list recalculation may slow down the optimization. 
It is useful in some simulations to be able to set CONTACT_SHELL to something large (e.g., 8A) and UPDATE_DYNAMIC to 999999.9, so that the pairs list is prepared only at the beginning. However, you have to make sure that the potential energy is not invisibly pumped into the system by making contacts that are not on the list of non-bonded pairs (see below). UPDATE_DYNAMIC sets the cumulative maximal atomic shift that triggers recalculation of the list of atom- atom non-bonded pairs. It should be set in combination with CONTACT_SHELL. For soft-sphere overlap, to be absolutely sure that no unaccounted contacts occur, UPDATE_DYNAMIC has to be equal to (CON- TACT_SHELL - maximal_overlap_distance) / 2. Maximal_overlap_distance is equal to the diameter of the largest atom in the model; it is 3.2 A in the case of the all non-hydrogen atoms model. This distance is the CONTACT_SHELL value if a default is requested. Factor 2 comes from the fact that the moves of both atoms can reduce the distance between them. DYNAMIC_SPHERE has to be set to on for the automatic generation of the soft-sphere overlap restraints. Another necessary condition is that the scaled standard deviation of the soft-sphere overlap restraints is greater than zero. It is simpler not to pre-calculate any soft-sphere overlap restraints and to use the dynamically generated restraints alone, although this may be slower. Similarly, DYNAMIC_LENNARD, DYNAMIC_COULOMB and DYNAMIC_MODELLER determine whether the dynamic Lennard-Jones terms, electrostatic interactions, and Modeller non-bonded spline restraints are calculated during optimization. The initial atom radii (before scaling by RADII_FACTOR) depend on TOPOLOGY_MODEL which selects a column of radii for the specified topology model from the $RADII_LIB library file. RADII_FACTOR is the scaling factor for the atom radii as read from the library file. The scaled radii are used only for the calculation of violations of the soft-sphere overlap restraints. 
LENNARD_JONES_SWITCH is a real vector of two elements. It specifies rmin and rmax for the Lennard-Jones interaction (Eq. 6.62). The potential is smoothed down to zero between these two distances. COULOMB_SWITCH is a real vector of two elements. It specifies rmin and rmax for the electrostatic interaction (Eq. 6.59). The potential is smoothed down to zero between these two distances. RESIDUE_SPAN_RANGE is taken into account when calculating dynamic pairwise non-bonded restraints (RESIDUE_SPAN_SIGN is ignored here). The dynamic restraints include soft-sphere overlap, Lennard-Jones, electrostatic restraints, and general spline restraints. The first three types of restraints can also be gen- erated as static restraints by MAKE_RESTRAINTS. See MAKE_RESTRAINTS for description of RESIDUE_SPAN_RANGE. For the dynamic restraints, only the restraints with all atoms in the selected set of atoms are included. This means that the selected set has to be large enough to include atoms on the boundary of the selected set that will not change much as a result of optimization. This is needed so that new, unaccounted for, violations are not introduced as a result of overlap with neighboring atoms not in the selected set. If such a situation happens in the hot spot refinement, hot spot selection and refinement should be repeated. The automatically generated dynamic restraints are always deleted after a command that calculates them is finished (OPTIMIZE, ENERGY, PICK_HOT_ATOMS, ENERGY_PROFILE); you have to use MAKE_RESTRAINTS to calculate equivalent static restraints if you want to write the `dynamic' restraints to a file. MIN_ATOM_SHIFT is a convergence criterion for the conjugate gradients optimization. When the maximal atomic shift is less than the specified value, the optimization is finished regardless of the number of optimiza- tion cycles or function value and its change. MAX_ITERATIONS is used to prevent a waste of CPU time in the conjugate gradients optimization. 
When that many cycles are done, the optimization is finished regardless of the maximal atomic shift. Before calculating dynamic non-bonded restraints, Modeller determines which of the several routines is most appropriate and efficient for calculating the non-bonded atom pairs list. The user can influence this selection by specifying two variables: DYNAMIC_FLAG, which has an effect when only a subset of all atoms is selected by the PICK_ATOMS or PICK_HOT_ATOMS commands (set 1), and NLOGN_USE, which has an effect when all atoms are selected. If DYNAMIC_FLAG is 0 (default), the non-bonded pairs will contain only selected atoms (set 1). This means that the optimized atoms will not "feel" the rest of the protein through 94 CHAPTER 2. MODELLER COMMANDS the non-bonded terms at all. If DYNAMIC_FLAG = 1, only one of the atoms in the non-bonded pair has to be a selected atom. This means that the selected region feels the rest of the system through the non-bonded terms, at the expense of longer CPU times. When all atoms are selected, DYNAMIC_FLAG of course has no effect. However, in that case, NLOGN_USE is used to select either a straightforward $O(n^2)$ search or a cell-based algorithm which has n log n dependency of CPU time versus size n. The latter algorithm is used when the maximal difference in residue indices of the atoms in the current dynamic restraints is larger than NLOGN_USE or when the box size for this algorithm would have to be larger than 8A. The molecular dynamics optimizer pretends that the natural logarithm of the molecular pdf is energy in kcal/mole. MD_TIME_STEP is the time step in femtoseconds. TEMPERATURE is the temperature of the system in degrees Kelvin. MAX_ITERATIONS determines the number of MD steps. If MD_RETURN is 'FINAL', the last structure is returned as the MODEL. If MD_RETURN is 'MINIMAL', then the structure with the lowest value of the objective function on the whole trajectory is returned as the MODEL. 
Rescaling of velocities is done every EQUILIBRATE steps to match the specified temperature. Atomic shifts along one axis are limited by CAP_ATOM_SHIFT. This value should be smaller than UPDATE_DYNAMIC. If INIT_VELOCITIES = on, the velocity arrays are initialized, otherwise they are not. In that case, the final velocities from the previous run are used as the initial velocities for the current run. RAND_SEED is the seed for the random number generator. It has to be between -2 and -50000. Its value is changed after the return from the optimization routine. MOLPDF contains the value of the objective function at the end of optimization. MODELLER_STATUS is set to 1 if optimization is aborted because dynamic restraints could not be calculated as a result of a system being too large. If MODELLER_STATUS is equal to or greater than STOP_ON_ERROR, the execution is stopped. Otherwise the execution returns to the Top routine, exiting all optimization routines immediately. The execution then continues as if nothing happened. It is up to the calling Top routine to ensure that sensible action is taken; e.g., skipping the rest of modeling for the model that resulted in an impossible function evaluation. This option is useful when calculating several independent models and you do not want one bad model to abort the whole calculation. A probable reason for an interrupted optimization is that it was far from convergence by the time the calculation of dynamic restraints was first requested. Two possible solutions are: (1) optimize more thoroughly (i.e., slowly) and (2) use a different contact pairs routine (SET NLOGN_USE = 9999). MODELLER_STATUS can be used in the Top routine to exit from an optimization of a hopeless model and to continue with another model from a different initial conformation. Example: # Example for: OPTIMIZE, SWITCH_TRACE # This will optimize stereochemistry of a given model, including # non-bonded contacts. 
READ_TOPOLOGY FILE = '$(LIB)/top_heav.lib' READ_PARAMETERS FILE = '$(LIB)/par.lib' READ_MODEL FILE = '1fas' SEQUENCE_TO_ALI ATOM_FILES = '1fas', ALIGN_CODES = '1fas' SEQUENCE_TO_ALI ADD_SEQUENCE = on, ATOM_FILES = ATOM_FILES '1fas.ini', ; ALIGN_CODES = ALIGN_CODES '1fas-ini' GENERATE_TOPOLOGY SEQUENCE = '1fas-ini' TRANSFER_XYZ BUILD_MODEL INITIALIZE_XYZ = off WRITE_MODEL FILE = '1fas.ini' # Generate the restraints: MAKE_RESTRAINTS RESTRAINT_TYPE = 'stereo' WRITE_RESTRAINTS FILE = '1fas.rsr' ENERGY DYNAMIC_SPHERE = on SWITCH_TRACE TRACE_OUTPUT = 1, FILE = '1fas.trc' 2.6. OPTIMIZATION OF THE MODEL 95 _______________________________________________________________________________________________________ __Column_______Description_____________________________________________________________________________ 1 iteration number within one step of the variable target function method 2 number of function evaluations within one step of VTFM 3 objective function value 4 average atomic shift 5 maximal atomic shift 6 proportional to the gradient 7 kinetic energy 8 temperature for molecular dynamics optimization __9____________total_energy_(kinetic_and_potential;_potential_=_objective_function_value)______________ Table_2.5:_Columns_in_an_optimization_trace_file.________ OPTIMIZE OPTIMIZATION_METHOD = 1, MAX_ITERATIONS = 20 OPTIMIZE OPTIMIZATION_METHOD = 3, TEMPERATURE = 300, MAX_ITERATIONS = 50 OPTIMIZE OPTIMIZATION_METHOD = 1, MAX_ITERATIONS = 20 ENERGY WRITE_MODEL FILE = '1fas.B' 2.6.7 SWITCH__TRACE _ open new optimization trace file Options: FILE = 'default' partial or complete filename DIRECTORY = '' directory list (e.g., 'dir1:dir2:dir3:./:/') TRACE_OUTPUT = 0 modulus for writing information about optimiza- tion iterations: 0 for nothing Description: This command specifies the file for the subsequent optimization tracing output. It is useful for separating tracing output for different models constructed in a single run of Modeller . 
The tracing output is only produced if TRACE_OUTPUT is larger than 0. The tracing file includes the iteration number, number of function evaluations, function value, average and maximal atomic shifts, the size of the gradient vector, kinetic energy (for molecular dynamics `optimization' only), temperature (MD only) and total energy. This is written out in every TRACE_OUTPUT-th cycle of whatever optimization method is used, starting with the state just before the optimization (iteration 0). When using the model script for comparative modeling, there is one .D file for each .B file with a model. The .D files contain information about the progress of optimization, from the beginning to the end. The most important column is column 3, which contains the value of the objective function, which is being optimized, as a function of the iteration step (every 10 steps, by default). Thus, the best model, according to Modeller , is the one that has the lowest number in the third column of the last line of its .D file. This value is also written out in the REMARK record of the PDB file containing the model and in the log file. Example: See OPTIMIZE command. 96 CHAPTER 2. MODELLER COMMANDS 2.6.8 DEBUG__FUNCTION _ test code self-consistency Options: DEBUG_FUNCTION_CUTOFF = 2.0 0.05 0.1 cutoffs for reporting differences between numeri- cal and analytical derivatives: absolute, relati* *ve errors, factor_for_indiv_rstrs DETAILED_DEBUGGING = off whether to evaluate energy and derivatives wrt each restraint all the ENERGY options Description: This command checks the self-consistency of the code for the objective function and its deriva- tives by calculating and comparing numeric and analytical derivatives. All the parameters influencing the evaluation of the molecular pdf are also relevant (see ENERGY). 
The derivative is reported if both the absolute difference and the fractional difference between the two kinds of evaluations are larger than DEBUG_FUNCTION_CUTOFF[1] and DEBUG_FUNCTION_CUTOFF[2], respectively. When DETAILED_DEBUGGING is on, the analytic and numeric derivatives of each restraint with respect to atomic positions are also compared for the atoms `violated' by the whole molecular pdf. The absolute cutoff for writing out the discrepancies is scaled by DEBUG_FUNCTION_CUTOFF[3]; the relative cutoff remains the same as before. When Modeller is compiled in double precision, this test reports a smaller number of discrepancies. Example: # Example for: DEBUG_FUNCTION # This will use default MODELLER scripts to construct homology # restraints for 1fas. It will then use DEBUG_FUNCTION to test # the source code for the function and derivatives calculation # by comparing analytical and numerical first derivatives. # Some discrepancies will be reported but ignore them here. INCLUDE SET ALNFILE = 'debug_function.ali' SET SEQUENCE = '1fas' SET KNOWNS = '2ctx' '1nbt' SET SPLINE_ON_SITE = off CALL ROUTINE = 'model', EXIT_STAGE = 1 # To assign 0 weights to restraints whose numerical derivatives # code does not work (i.e., splines for angles and dihedrals): READ_SCHEDULE FILE = 'debug_function.sched' ENERGY DEBUG_FUNCTION DEBUG_FUNCTION_CUTOFF = 15.00 0.10 0.1, DETAILED_DEBUGGING = on Chapter 3 Modeller scripts This section describes some of the Modeller scripts found in the $MODINSTALL/bin/__*.top files. All these files and brief descriptions are listed in Table 3.1. 
__________________________________________________________________________________________________________________________ __Filename________________________Description_____________________________________________________________________________ __model.top the main script for comparative modeling without alignment __full_homol.top the main script for comparative modeling with alignment __loop.top loop modeling (in development!) __defs.top variable definitions for modeling by model __align_strs_seq.top aligning many structures with a sequence __getnames.top generating default filenames from protein codes __homcsr.top generating homology-derived restraints __spline.top generating splined restraints __cispeptide.top defining cis-peptides __default_patches.top making topology patches during modeling by model __special.top generating and reading special restraints/patches for modeling by model __generate_model.top generating initial models for modeling by model __single_model.top producing a single model by model __multiple_models.top used by model to generate an ensemble of models __refine.top molecular dynamics refinement for modeling by model __principal.top principal components clustering __asgl_mod.top plotting for clustering analysis (requires Asgl ) __complete.top generating missing atoms in a PDB file __fit.top superposing two structures, given an alignment ____mod.top_______________________the_main_include_file_including_all_other___*.top_files_________________________________ _____Table_3.1:_List_of_Modeller_______scripts.________ 3.1 Flowchart of comparative modeling by Modeller This section describes a flowchart of comparative modeling by Modeller , as implemented in the 'model' Top script. This script is also called by Quanta and InsightII . It can be used for a variety of modeling tasks, not only for comparative modeling. Input: script file, alignment file, PDB file(s) for template(s). Output: 97 98 CHAPTER 3. 
MODELLER SCRIPTS .log log file .ini initial conformation for optimization .rsr restraints file .sch VTFM schedule file .B9999???? PDB atom file(s) for the model(s) of the target sequence .V9999???? violation profiles for the model(s) The main Modeller routines used in each step are given in parentheses. 1. Read and check the alignment between the target sequence and the template structures (READ_ALIGNMENT and CHECK_ALIGNMENT). 2. Calculate restraints on the target from its alignment with the templates: (a) Generate molecular topology for the target sequence (GENERATE_TOPOLOGY). Disulfides in the target are assigned here from the equivalent disulfides in the templates (PATCH_DISULFIDES). Any user defined patches are also done here (as defined in Top routine `special_patches'). (b) Calculate coordinates for atoms that have equivalent atoms in the templates as an average over all templates (TRANSFER_XYZ) (alternatively, read the initial coordinates from a file). (c) Build the remaining unknown coordinates using internal coordinates from the Charmm topology library (BUILD_MODEL). (d) Write the initial model to a file with extension .ini (WRITE_MODEL). (e) Generate stereochemical, homology-derived, and special restraints (MAKE_RESTRAINTS) (alternatively, skip this and assume the restraints file already exists): stereochemical RESTRAINT_TYPE = 'bond angle dihedral improper' mainchain dihedrals $\Phi$, $\Psi$ RESTRAINT_TYPE = 'phi-psi_binormal' mainchain dihedral $\omega$ 
RESTRAINT_TYPE = 'omega_dihedral' sidechain dihedral $\chi_1$ RESTRAINT_TYPE = 'chi1_dihedral' sidechain dihedral $\chi_2$ RESTRAINT_TYPE = 'chi2_dihedral' sidechain dihedral $\chi_3$ RESTRAINT_TYPE = 'chi3_dihedral' sidechain dihedral $\chi_4$ RESTRAINT_TYPE = 'chi4_dihedral' mainchain CA-CA distance RESTRAINT_TYPE = 'distance' mainchain N-O distance RESTRAINT_TYPE = 'distance' sidechain-mainchain distance RESTRAINT_TYPE = 'distance' sidechain-sidechain distance RESTRAINT_TYPE = 'distance' block distance restraints RESTRAINT_TYPE = 'distance' user defined CALL ROUTINE = 'special_restraints' non-bonded pairs distance RESTRAINT_TYPE = 'sphere'; calculated on the fly (f) Write all restraints to a file with extension .rsr (WRITE_RESTRAINTS). 3. Calculate model(s) that satisfy the restraints as well as possible. For each model: (a) Generate the optimization schedule for the variable target function method (VTFM) (MAKE_SCHEDULE). (b) Read the initial model (usually from the .ini file from 2.d) (READ_MODEL). (c) Randomize the initial structure by adding a random number between $\pm$DEVIATION angstroms to all atomic positions (RANDOMIZE_XYZ). (d) optimize the model: o Partially optimize the model by VTFM; Repeat the following steps as many times as specified by the optimization schedule: - Read all the restraints by `rd_restraints' (READ_RESTRAINTS). - Select only the restraints that operate on the atoms that are close enough in sequence, as specified by the current step of VTFM (PICK_RESTRAINTS). - Optimize the model by conjugate gradients, using only currently selected restraints (OPTIMIZE). 3.2. SCRIPT FOR COMPARATIVE MODELING 99 o Refine the model by simulated annealing with molecular dynamics, if so selected: - do a short conjugate gradients optimization (OPTIMIZE). - increase temperature in several steps and do molecular dynamics optimization at each temperature (OPTIMIZE). - decrease temperature in several steps and do molecular dynamics optimization at each temperature (OPTIMIZE). 
- do a short conjugate gradients optimization (OPTIMIZE). (e) calculate the remaining restraint violations and write them out (ENERGY). (f) write out the final model to a file with extension .B9999???? where ???? indicates the model number (WRITE_MODEL). Also write out the violations profile. Also write superposed templates and models if so selected by FINAL_MALIGN3D = 1. 3.2 Script for comparative modeling The __model script implements the flowchart for comparative modeling by Modeller that is described in the previous Section 3.1. The script uses routines in several other files. It is structured so that it is easy to deal with many different situations, some of which are described in Chapter 4. The script is too long to be listed here. It can be found in $MODINSTALL/bin/__model.top. The default values of its arguments are defined in the __defs script file: # Define additional TOP variables needed for MODELLER: DEFINE_INTEGER VARIABLES = STARTING_MODEL ENDING_MODEL RSTRS_REFINED DEFINE_INTEGER VARIABLES = MAX_ITERATIONS_STORE WRITE_INTERMEDIATES DEFINE_INTEGER VARIABLES = IREPEAT REPEAT_OPTIMIZATION EXIT_STAGE DEFINE_INTEGER VARIABLES = CREATE_RESTRAINTS REFINE_HOT_ONLY DEFINE_INTEGER VARIABLES = MAX_VAR_ITERATIONS FINAL_MALIGN3D DEFINE_REAL VARIABLES = VIOL_REP_STORE MAX_MOLPDF DEFINE_REAL VARIABLES = MAX_CA-CA_DISTANCE MAX_N-O_DISTANCE DEFINE_REAL VARIABLES = MAX_SC-SC_DISTANCE MAX_SC-MC_DISTANCE DEFINE_STRING VARIABLES = MODEL MODEL2 CODE CODE2 ALNFILE MODEL2_FIT DEFINE_STRING VARIABLES = CSRFILE KNOWNS SCHFILE FINAL_MODEL DEFINE_STRING VARIABLES = GENERATE_METHOD RAND_METHOD MD_LEVEL DEFINE_STRING VARIABLES = SEGFILE PDB_EXT TOPLIB PARLIB FAMILY FIT_IN_REFINE DEFINE_STRING VARIABLES = ATOM_IDS1 ATOM_IDS2 OUTPUT2 DEFINE_STRING VARIABLES = LOOP_CSRFILE LOOP_INI_MODEL # default values for options in comparative modeling by MODELLER: SET STARTING_MODEL= 1 # the index of the first model; # determines how many models are calculated; SET ENDING_MODEL = 1 # the index 
of the last model; # determines how many models are calculated; SET DEVIATION = 4.0 # the amount of randomization of the initial model # must be > 0 if different final models are wanted; # # Do not forget to set WATER_IO, HETATM_IO, HYDROGEN_IO to ON if your model # includes WATER, HYDROGEN, and/or HETATM atoms. # # Additional flexibility is provided by re-defining the TOP routines # 'select_atoms', 'special_restraints', 'special_patches', and # 'rd_restraints'. # 100 CHAPTER 3. MODELLER SCRIPTS # Options that are not changed frequently: SET LIBRARY_SCHEDULE = 4 # 1 ... thorough var target func schedule # 4 ... faster var target func schedule SET MAX_VAR_ITERATIONS = 200 # maximal numb of iterations for the cycles # of the variable target function method SET MD_LEVEL = 'refine3' # what kind of optimization is done after # the variable target function method: # 'nothing' ... nothing; # 'refine1' ... thorough MD annealing; # 'refine2' ... fast MD annealing; # 'refine3' ... very fast MD annealing; SET REFINE_HOT_ONLY = 0 # 1 ... select and optimize only HOT atoms in refine; # 0 ... select and optimize all atoms in refine; # usually about half of the atoms are hot; in such cases, # 0 is faster for sequences longer than about 100 aa # because a faster non-bonded pairs algorithm can be used. SET RSTRS_REFINED = 1 # the types of restraints used to define # hot spots when MD_LEVEL <> 'nothing': # 0 ... stereochemistry only; # 1 ... stereochemistry and dihedral; # 2 ... all restraints; SET EXIT_STAGE = 0 # 0 ... no effect; # 1 ... 
exit without any optimization after # restraints and an initial model are # calculated (more efficient than # REPEAT_OPTIMIZATION=0); SET REPEAT_OPTIMIZATION = 1 # how many times the whole optimization # schedule (variable target function # method and refinement) is repeated # for each initial model; SET TRACE_OUTPUT = 10 # every which CG or MD cycle is reported; SET MAX_MOLPDF = 100E3 # abort optimization of the current model if # the molecular pdf is larger than this and # continue with the next model; SET TOPLIB = '${LIB}/top_heav.lib' # topology library (all non-hydrogen atoms); SET TOPOLOGY_MODEL = 3 # corresponding topology model; SET PARLIB = '${LIB}/par.lib' # parameters library; SET WRITE_INTERMEDIATES = 0 # 0 ... do not write out intermediate # atom files during optimization; # 1 ... write out intermediate atom files; SET FINAL_MALIGN3D = 0 # 0 ... do not do MALIGN3D and write # superposed templates & models # at the end of 'model' # 1 ... do that. SET GENERATE_METHOD= 'transfer_xyz' # how to build the initial model: # 'generate_xyz' from internal coordinates 3.2. SCRIPT FOR COMPARATIVE MODELING 101 # and write them to a file; # 'transfer_xyz' from template coordinates # and write them to a file; # 'read_xyz' read coordinates from # a file; SET RAND_METHOD = 'randomize_xyz' # a method to perturb the initial model: # 'randomize_dihedrals' ... uses DEVIATION # in degrees; # 'randomize_xyz' ... uses DEVIATION # in angstroms; # 'nothing' SET CREATE_RESTRAINTS = 1 # 0 ... read the restraints from a file; # 1 ... make the restraints and write them # to a file before reading them # for the optimization; in addition # to the default restraints, the TOP # routine 'special_restraints', # which may be re-defined in the # user TOP file, is called for any # user defined restraints that are # then also written to the same file. SET SPLINE_ON_SITE = on # on ... convert some restraints into splines # off ... 
no conversion SET OUTPUT_CONTROL = 1 0 0 1 # write real_output, notes, warnings, errors # Set maximal values for various distance restraints: SET MAX_CA-CA_DISTANCE = 14.0 SET MAX_N-O_DISTANCE = 11.0 SET MAX_SC-MC_DISTANCE = 5.5 SET MAX_SC-SC_DISTANCE = 5.0 # Routine 'user_after_single_model' can be redefined to do whatever at the end # of each model calculation (e.g. comparison with X-ray structure). # To write out reports on individual optimizations: SET OUTPUT = 'NO_REPORT SHORT' # MSI for QUANTA: # The alignment file format (I/O): SET ALIGNMENT_FORMAT = 'PIR' # The extension added to all *.Bxxxxnn filenames: SET PDB_EXT = ' ' # SET PDB_EXT = '.modlr.pdb' # to prevent SUPERPOSE in refine() if molecules are too small: SET FIT_IN_REFINE = 'NO_FIT' # To enable default filename generation if not explicitly defined: SET MODEL = 'undefined' SET CSRFILE = 'undefined' # Call this routine before calling 'model' if you want real fast optimization SUBROUTINE ROUTINE = 'very_fast' 102 CHAPTER 3. MODELLER SCRIPTS # SET STARTING_MODEL = 1 # SET ENDING_MODEL = 1 SET MAX_CA-CA_DISTANCE = 10.0 SET MAX_N-O_DISTANCE = 6.0 SET MAX_SC-MC_DISTANCE = 5.0 SET MAX_SC-SC_DISTANCE = 4.5 # Note that all models will be the same if you do not change RAND_METHOD SET RAND_METHOD = 'nothing' SET MAX_VAR_ITERATIONS = 50 SET LIBRARY_SCHEDULE = 7 SET MD_LEVEL = 'nothing' RETURN END_SUBROUTINE Chapter 4 Frequently asked questions (FAQ) 1. I do not care about the details of a model, I only want to calculate it very fast to get a quick idea about how it looks or to confirm that my alignment is plainly unreasonable in the structural sense. Note that only one model can be calculated in this way because the starting structure is not randomized before optimization. Only a very limited amount of the variable target function optimization with conjugate gradients is done. This is usually for a factor of 3 faster than the default procedure. 
For example, it takes about 17 seconds of CPU time to model a 60-residue protein on an SGI workstation with a R10000-195 processor. # Very fast homology modelling by the MODELLER TOP routine 'model'. INCLUDE # Include the predefined TOP routines SET ALNFILE = 'alignment.ali' # alignment filename SET KNOWNS = '5fd1' # codes of the templates SET SEQUENCE = '1fdx' # code of the target SET ATOM_FILES_DIRECTORY = './:../atom_files' # directories for input atom files SET STARTING_MODEL = 2 SET ENDING_MODEL = 2 CALL ROUTINE = 'very_fast' # prepare for extremely fast optimization CALL ROUTINE = 'model' # do homology modelling 2. How can I refine the model in successive steps? There is a pre-defined routine `select_atoms' which selects the atoms to be moved during optimization. By default, the routine selects all atoms, but you can redefine it to select any subset of atoms and then only those atoms will be refined. They will `feel' the presence of other atoms via all the static and possibly dynamic restraints that include both selected and un-selected atoms. For example, the script below would refine only atoms in residues 1 and 2 (file 'examples/tutorial-model/model-segment.top'). The difference between this script and the one in the question about loop modeling is that here the selected regions are optimized with the default optimization protocol and the default restraints, which generally include template-derived restraints. In contrast, the loop modeling routine does not use template-dependent restraints, but does a much more thorough optimization. # Homology modelling by the MODELLER TOP routine 'model'. # Demonstrates how to refine only a part of the model. # # You may want to use the more exhaustive "loop" modeling routines instead. 103 104 CHAPTER 4. 
FREQUENTLY ASKED QUESTIONS (FAQ) # INCLUDE # Include the predefined TOP routines SET ALNFILE = 'alignment.ali' # alignment filename SET KNOWNS = '5fd1' # codes of the templates SET SEQUENCE = '1fdx' # code of the target SET ATOM_FILES_DIRECTORY = './:../atom_files' # directories for input atom files SET STARTING_MODEL= 3 # index of the first model SET ENDING_MODEL = 3 # index of the last model # (determines how many models to calculate) SET DYNAMIC_FLAG = 0 # selected atoms do not feel the neighbourhood CALL ROUTINE = 'model' # do homology modelling SUBROUTINE ROUTINE = 'select_atoms' PICK_ATOMS SELECTION_SEGMENT='1:' '2:', SELECTION_SEARCH='segment', ; PICK_ATOMS_SET=1, RES_TYPES='all', ATOM_TYPES='all', ; SELECTION_FROM='all', SELECTION_STATUS='initialize', SELECTION_STEP=1 RETURN END_SUBROUTINE 3. I want to model one or more loops very thoroughly (meaning spending a lot of CPU time, not necessarily modeling more accurately). Note that loops and insertions are already modeled by the default modeling routine, so you do not have to do anything special to get a model for the insertions. However, if you really want to focus on loops, you can use a preliminary version of the new loop modeling routine that will appear in Modeller -5. The selected regions are optimized independently many times by a thorough molecular dynamics/simulated annealing procedure, using sequence-dependent restraints only, no templates. These restraints currently include the Charmm - derived stereochemical and non-bonded restraints, as well as statistical preferences of the different residue types for the different regions of the Ramachandran plot and for the different sidechain rotamers. Note that this procedure is not thoroughly evaluated and is likely to change in the near future. # Homology modelling by the MODELLER TOP routine 'model'. # Demonstrates how to refine only a part of the model. 
# # The difference with model-segment is that the loop is # refined on the basis of sequence alone, in the context # of the rest of the structure. # INCLUDE # Include the predefined TOP routines SET SEQUENCE = '1fdx' # code of the target SET MODEL = '1fdx.B99990001' # initial model of the target SET ATOM_FILES_DIRECTORY = './:../atom_files' # directories for input atom files SET STARTING_MODEL= 20 # index of the first loop model SET ENDING_MODEL = 23 # index of the last loop model # (determines how many models to calculate) SET MD_LEVEL = 'refine1' # the loop refinement method SET RAND_SEED = -34871 CALL ROUTINE = 'loop' # Cluster the models: 105 CALL ROUTINE = 'cluster', ID1 = STARTING_MODEL, ID2 = ENDING_MODEL # This routine picks model residues that need to be refined (necessary): SUBROUTINE ROUTINE = 'select_atoms' # Uncomment if you also want to optimize the loop environment: # SET SELECTION_SEARCH = 'SPHERE_SEGMENT', SPHERE_RADIUS = 6 # 4 residue insertion (1st loop): PICK_ATOMS SELECTION_SEGMENT = '19:' '28:', SELECTION_STATUS = 'initialize' # 2 residue insertion (2nd loop): PICK_ATOMS SELECTION_SEGMENT = '46:' '55:', SELECTION_STATUS = 'add' RETURN END_SUBROUTINE # This routine adds any special restraints (optional): # # SUBROUTINE ROUTINE = 'special_restraints' # MAKE_RESTRAINTS RESTRAINT_TYPE = 'ALPHA', RESIDUE_IDS = '46:' '55:' # RETURN # END_SUBROUTINE 4. I want to build a model of a chimeric protein based on two known structures. Alternatively, I want to build a multi-domain protein model using templates corresponding only to the indi- vidual domains. This can be accomplished using the standard modeling routine. 
The alignment file should be as follows when the chimera is a combination of proteins A and B: proteinA aaaaaaaaaaaaaaaaaaaaaaaaaaaa---------------------------------- proteinB ----------------------------bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb chimera aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb In the PIR format the alignment file is: >P1;proteinA structureX:proteinA aaaaaaaaaaaaaaaaaaaaaaaaaaaa----------------------------------* >P1;proteinB structureX:proteinB ----------------------------bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb* >P1;chimera sequence:chimera aaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb* If no additional information is available about the relative orientation of the two domains the resulting model will probably have an incorrect relative orientation of the two domains when the overlap between A and B is non-existing or short. 5. I don't want to use one region of a template for construction of my model. The easiest way to achieve this is to not align that region of the template with the target sequence. If region "bbbbbbbbb" of the template should not be used as a template for region "eeeee" of the target sequence the alignment should be like this: 106 CHAPTER 4. FREQUENTLY ASKED QUESTIONS (FAQ) template aaaaaaaaaaaaaaaaaaaaaaaa-----bbbbbbbbcccccccccccccccccccccccccccccc target ddddddddddddddddddddddddeeeee--------ffffffffffffffffffffffffffffff The effect of this alignment is that no homology-derived restraints will be produced for region "eeeee". 6. I want to define (additional) disulfide bonds in the target sequence because no equivalent disulfide bonds exist in any of the templates (in which case PATCH_DISULFIDES cannot define them automatically). 
# This is as usual: INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' CALL ROUTINE = 'model' STOP # Redefine the special_patches routine to include the additional disulfides # (this routine is empty by default): SUBROUTINE ROUTINE = 'special_patches' # A disulfide between residues 1 and 85 in chain A: PATCH RESIDUE_TYPE = 'DISU', RESIDUE_IDS = '1:A' '85:A' # A disulfide between residues 41 and 45 in chain B: PATCH RESIDUE_TYPE = 'DISU', RESIDUE_IDS = '41:B' '45:B' RETURN END_SUBROUTINE 7. I want to explicitly force certain Pro residues to the cis $\omega$ conformation. Note that Modeller should usually be allowed to handle this automatically via the omega dihedral angle restraints, which are calculated by default. # This is as usual: INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' CALL ROUTINE = 'model' STOP # Redefine the special_restraints routine to force Pro to cis conformation: # (this routine is empty by default): SUBROUTINE ROUTINE = 'special_restraints' CALL ROUTINE = 'cispeptide', ATOM_IDS1 = 'O:4' 'C:4' 'N:5' 'CA:5', ; ATOM_IDS2 = 'CA:4' 'C:4' 'N:5' 'CA:5' RETURN END_SUBROUTINE 8. How can I select/remove/add a set of restraints? Restraints can be read from a file by READ_RESTRAINTS, calculated by MAKE_RESTRAINTS, or added `manually' by ADD_RESTRAINT. PICK_RESTRAINTS picks those restraints for objective function calculation that restrain the selected atoms only, as specified in the selected atoms set 1. Initially, all atoms are selected; this can be changed by the PICK_ATOMS command. MAKE_RESTRAINTS command for some restraint types (e.g., distance) constructs restraints of the selected type between the atoms in the selected atoms sets 2 and 3. Script 'scripts/__homcsr.top' can be studied for examples of the use of the PICK_ATOMS command when generating restraints by MAKE_RESTRAINTS. There are also commands for adding and deleting single restraints, ADD_RESTRAINT and DELETE_RESTRAINT, respectively. 
If you do CONDENSE_RESTRAINTS, the unselected restraints will be deleted. This is useful for getting rid of the unwanted restraints completely. 9. I want to add my own restraints for optimization of the model. You can read your restraints whenever the default restraints are read. 107 INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' CALL ROUTINE = 'model' STOP # Redefine the rd_restraints routine: SUBROUTINE ROUTINE = 'rd_restraints' # This is the default homology-derived restraints: READ_RESTRAINTS FILE = CSRFILE, ADD_RESTRAINTS = off # This is two additional user provided files: READ_RESTRAINTS FILE = 'my_rsrs1.rsr', ADD_RESTRAINTS = on READ_RESTRAINTS FILE = 'my_rsrs2.rsr', ADD_RESTRAINTS = on SET ADD_RESTRAINTS = off RETURN END_SUBROUTINE 10. I want to add my own restraints to the file with the automatically derived homology restraints, immediately after the default calculation of the homology-derived restraints. This is achieved by redefining the special_restraints routine, which is empty by default. INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' CALL ROUTINE = 'model' # Redefine the special_restraints routine: SUBROUTINE ROUTINE = 'special_restraints' # Add some restraints from a file to existing homology-derived restraints: READ_RESTRAINTS FILE = 'my_rsrs1.rsr', ADD_RESTRAINTS = on # Restrain the specified CA-CA distance to 10 angstroms (st.dev.=0.1). # Use a harmonic potential and X-Y distance group. SET RESTRAINED_ATOMS = 'CA' 'CA', RESIDUE_IDS = '35:A' '40:C' ADD_RESTRAINT RESTRAINT_PARAMETERS = 3 1 1 27 2 2 0 10.0 0.1 SET ADD_RESTRAINTS = off RETURN END_SUBROUTINE 11. I have my own restraints file to be used exclusively for optimization by the default comparative modeling routine. INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' SET CSRFILE = 'targ1.rsr', CREATE_RESTRAINTS = 0 CALL ROUTINE = 'model' 12. 
I have my own initial structure to be used for optimization by the default comparative modeling routine. INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' # Specify the initial structure filename, and tell the program to read the initial file, not construct it from the templates: SET MODEL = 'targ1.ini', GENERATE_METHOD = 'read_xyz' CALL ROUTINE = 'model' 13. What are the different refinement levels really doing? There are two different optimization approaches available within Modeller : variable target function method (VTFM) with conjugate gradients (CG) [Sali & Blundell, 1993] and molecular dynamics (MD) with simulated 108 CHAPTER 4. FREQUENTLY ASKED QUESTIONS (FAQ) annealing (SA) [Sali & Blundell, 1993]. They can both be done to a different degree (with more or less cycles of CG and MD, faster or slower schedule for VTFM and SA). The exact details are best obtained from the scripts themselves because a detailed description would probably be longer than the scripts. For example, the Quanta and InsightII implementations of Modeller have these three levels of optimization: no optimization (only copying coordinates from templates and building the undefined atoms using internal geometry information from the RTF entries); only VTFM with CG; also MD with SA. Most of the time (70%) is spent on the MD&SA part. Our experience is that when MD&SA are used, if there are violations in the best of the 10 models, they probably come from an alignment error, not an optimizer failure. 14. I want to change the default optimization schedule. See file scripts/__defs.top for the variables that could be changed and for their possible values. INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' # Very thorough VTFM optimization: SET LIBRARY_SCHEDULE = 1, MAX_VAR_ITERATIONS = 300 # Very thorough MD optimization: SET MD_LEVEL = 'refine1' # Repeat the whole cycle 3-times and do not stop unless obj.func. 
> 1E6 SET REPEAT_OPTIMIZATION = 3, MAX_MOLPDF = 1E6 CALL ROUTINE = 'model' 15. I want to build an all hydrogen atom model with water molecules and other non-protein atoms (atoms in the HETATM records in the PDB file). INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' SET MODEL_TOPOLOGY = 1, HYDROGEN_IO = on, HETATM_IO = on, WATER_IO = on SET TOPOLOGY_LIBRARY = $(LIB)/top.lib SET PARAMETERS_LIBRARY = $(LIB)/par.lib CALL ROUTINE = 'model' 16. How do I build a model with water molecules or residues that do not have an entry in the topology and/or parameter files? 17. FAQ:block Water molecules are indicated by 'w' in the alignment file and a special block residue ('BLK') that does not have entries in the residue topology and parameter libraries is indicated by '.' See Section 2.2.1 for information about block residues. INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' SET HETATM_IO = on, WATER_IO = on CALL ROUTINE = 'model' The alignment file: >P1;templ1 structureX:templ1:1::10:: FAYVI/.wwww* >P1;targ1 sequence:targ1:1::8:: -GWIV/.ww-w* * * 109 18. How do I define my own residue types, such as D-amino acids, special ligands, and unnatural amino-acids? This is a painful area in all molecular modeling programs. However, Charmm and X-PLOR provide a rea- sonably straightforward solution via the residue topology and parameter libraries. Modeller uses Charmm topology and parameter library format and also extends the options by allowing for a generic `BLK' residue type (Section 2.2.1). This BLK residue type circumvents the need for editing any library files, but it is not always possible to use it. Due to its conformational rigidity, it is also not as accurate as a normal residue type. In order to define a new residue type in the Modeller libraries, you have to follow the series of steps described below. As an example, we will define the ALA residue without any hydrogen atoms. 
For more information, please see the Charmm manual. (a) Define the new residue entry in the residue topology file (RTF), say top_heav.lib. RESI ALA 0.00000 ATOM N NH1 -0.29792 ATOM CA CT1 0.09563 ATOM CB CT3 -0.17115 ATOM C C 0.69672 ATOM O O -0.32328 BOND CB CA N CA O C C CA C +N IMPR C CA +N O CA N C CB IC -C N CA C 1.3551 126.4900 180.0000 114.4400 1.5390 IC N CA C +N 1.4592 114.4400 180.0000 116.8400 1.3558 IC +N CA *C O 1.3558 116.8400 180.0000 122.5200 1.2297 IC CA C +N +CA 1.5390 116.8400 180.0000 126.7700 1.4613 IC N C *CA CB 1.4592 114.4400 123.2300 111.0900 1.5461 IC N CA C O 1.4300 107.0000 0.0000 122.5200 1.2297 PATC FIRS NTER LAST CTER You can obtain an initial approximation to this entry by defining the new residue type using the residue type editor in Quanta and then writing it out to a file. The RESI record specifies the Charmm residue name, which can be up to four characters long and is usually the same as the PDB residue name (exceptions are the potentially charged residues where the different charge states correspond to different Charmm residue types). The number gives the total residue charge. The ATOM records specify the IUPAC (i.e., PDB) atom names and the Charmm atom types for all the atoms in the residue.1 The number at the end of each ATOM record gives the partial atomic charge. The BOND records specify all the covalent bonds between the atoms in the residue (e.g., there are bonds CB-CA, N-CA, O-C, etc.). In addition, symbol `+' is used to indicate the bonds to the subsequent residue in the chain (e.g., C - +N). The covalent angles and dihedral angles are calculated automatically from the list of chemical bonds. The IMPR records specify the improper dihedral angles, generally used to restrain the planarity of various groups (e.g., peptide bonds and sidechain rings). See also below. The IC (internal coordinate) records are used for constructing the initial Cartesian coordinates of a residue. 
An entry IC a b c d $d_{ab}$ $\alpha_{abc}$ $\Theta_{abcd}$ $\alpha_{bcd}$ $d_{cd}$ specifies distances $d$, angles $\alpha$, and either dihedral angles or improper dihedral angles $\Theta$ between atoms a, b, c and d, given by their IUPAC names. The improper dihedral angle is specified when the third atom, c, is preceded by a star, `*'. As before, the `-' and `+' pre-fixes for the atom names select the corresponding atom from the preceding and subsequent residues, respectively. The distances are in angstroms, angles in degrees. The distinction between the dihedral angles and improper dihedral angles is unfortunate 1___________________________________________________________ There are several small differences between the `Charmm ' IUPAC definitions here and those actually used by PDB (e.g., Ile CD1 in PDB is CD in Charmm , Leu CD1 and CD2 atoms are swapped, and the PDB carboxy terminal atoms O and OXT are OT1 and OT2 in Charmm , respectively). These differences are handled internally by Modeller . 110 CHAPTER 4. FREQUENTLY ASKED QUESTIONS (FAQ) since they are the same mathematically, except that by convention when using the equations, the order of the atoms for a dihedral angle is abcd and for an improper dihedral angle it is acbd. The PATC record specifies the default patching residue type when the current residue type is the first or the last residue in a chain. (b) You have to make sure that all the Charmm atom types of the new residue type occur in the MASS records at the beginning of the topology library: Add your entry at the end of the MASS list if necessary. If you added any new Charmm atom types, you also have to add them to the radii libraries, 'modlib/radii.lib' and 'modlib/radii14.lib'. These libraries list the atomic radii for the different topology models, for the long range and 1-4 non-bonded soft-sphere terms, respectively. The full names of the files that are used during calculation are given by the environment variables $RADII_LIB and $RADII14_LIB. 
(c) Optionally, you can add the residue entry to the library of Modeller topology models, modlib/models.lib. The runtime version of this library is specified by the environment variable $MODELS_LIB. This library specifies which subsets of atoms in the residue are used for each of the possible topologies. Currently, there are 9 topologies selected by MODEL_TOPOLOGY (3 is default): 1 ALLH all atoms 2 POL polar hydrogens only 3 HEAV non-hydrogen atoms only 4 MCCB non-hydrogen mainchain (N, C, CA, O) and CB atoms 5 MNCH non-hydrogen mainchain atoms only 6 MCWO non-hydrogen mainchain atoms without carbonyl O 7 CA CA atoms only 8 MNSS non-hydrogen mainchain atoms and disulfide bonds 9 CA3H reduced model with a small number of sidechain interaction centers The Ala entry is: # ALLH POLH HEAV MCCB MNCH MCWO CA MNSS CA3H * RESI ALA ATOM NH1 NH1 NH1 NH1 NH1 NH1 #### NH1 #### ATOM H HN #### #### #### #### #### #### #### ATOM CT1 CT1 CT1 CT1 CT1 CT1 CT1 CT1 CAH ATOM HB #### #### #### #### #### #### #### CH3E ATOM CT3 CT3 CT3 CT3 #### #### #### #### #### ATOM HA #### #### #### #### #### #### #### #### ATOM HA #### #### #### #### #### #### #### #### ATOM HA #### #### #### #### #### #### #### #### ATOM C C C C C C #### C #### ATOM O O O O O #### #### O #### The residue entries in this library are separated by stars. The `####' string indicates a missing atom. The atom names for the present atoms are arbitrary. The order of the atoms must be the same as in the Charmm residue topology library. If a residue type does not have an entry in this library, all atoms are used for all topologies. (d) You have to add the new residue type to the residue type library, modlib/restyp.lib. The execution version of this file is specified by the environment variable $RESTYP_LIB. For the ALA residue, 1 _ ALA _ A _ ALA _ alanine You would generally add the new residue type at the end of the file. There are 5 fields in each line, separated by the `_' characters. 
The first field is an integer index corresponding to the integer residue type. The standard residue types have their indices smaller than 24. These are also the indices corresponding to the residue-residue substitution matrices. The second field contains the list of equivalent PDB or IUPAC 3-character residue names, used in the PDB files. A list rather than a single name is allowed because PDB can unfortunately use different names for the same residue type (e.g., water can be HOH, WAT, etc.). The third field gives a single character code for the residue type, which is used in the alignment file. This does not have to be unique, but if it is not unique you cannot use it in the alignment file. The fourth field gives the four-character Charmm residue name, as specified in the RESI record of the topology library. The last field contains an optional comment. Every residue in the Charmm topology file has to have an entry in the $RESTYP_LIB library, but not every residue entry in the $RESTYP_LIB library needs an entry in the residue topology file. When you are adding a new residue type, you have to hope that the maximal number of residue types is not over-reached. If it is, a fatal error is reported at the beginning of the execution. In response, you could delete some of the un-needed existing residue types in the $RESTYP_LIB file, rather than re-compile the program with larger array sizes. (e) In general, when you add a new residue type, you also add new chemical bonds, angles, dihedral angles, improper dihedral angles, and non-bonded interactions, new in the sense that a unique combination of Charmm atom types is involved whose interaction parameters are not yet specified in the parameter library (see also Section 2.2.1). In such a case, you will get a number of warning and/or error messages when you generate the stereochemical restraints by the MAKE_RESTRAINTS command. 
These messages can sometimes be ignored because Modeller will guess the values for the missing parameters from the current Cartesian coordinates of the model. When this is not accurate enough or if the necessary coordinates are undefined you have to specify the parameters explicitly in the parameter library. Search for BOND, ANGL, DIHE, and IMPR sections in the parameters library file and use the existing entries to guess your new entries. Note that you can use dummy atom types `X' to create general dihedral (i.e., X A A X) and improper dihedral angle (i.e., A X X A) entries, where A stands for any of the real CHARMM atom types. If you use non-bonded Lennard-Jones terms, you also have to add a NONB entry for each new atom type. If you use the default soft-sphere non-bonded restraints, you have already taken care of it by adding the new atom types to the $RADII_LIB and $RADII14_LIB libraries. 19. How do I define my own patching residue types? This is even messier than defining a new residue type. As an example, we will define the patching residue for establishing a disulfide bond between two CYS residues. PRES DISU -0.36 ! Patch for disulfides. Patch must be 1-CYS and 2-CYS. ATOM 1CB CT2 -0.10 ! ATOM 1SG SM -0.08 ! 2SG--2CB-- ATOM 2SG SM -0.08 ! / ATOM 2CB CT2 -0.10 ! -1CB--1SG DELETE ATOM 1HG1 DELETE ATOM 2HG1 BOND 1SG 2SG IC 1CA 1CB 1SG 2SG 0.0000 0.0000 180.0000 0.0000 0.0000 IC 1CB 1SG 2SG 2CB 0.0000 0.0000 90.0000 0.0000 0.0000 IC 1SG 2SG 2CB 2CA 0.0000 0.0000 180.0000 0.0000 0.0000 The PRES record specifies the Charmm patching residue type (up to four characters). As for the normal RESI residue types, patching residue types also have to be defined in the residue type library, modlib/restyp.lib. The ATOM records have the same meaning as for the RESI residue types described above. The extension is that the IUPAC atom names (listed first) must be prefixed by the index of the residue that is patched. 
In this example, there are two CYS residues that are patched, thus the prefixes 1 and 2. When using the PATCH command, the order of the patched residues specified by RESIDUE_IDS must correspond to these indices (this is only important when the patch is not symmetric, unlike the 'DISU' patch in this example). DELETE records specify the atoms to be deleted, the two hydrogens bonded to the two sulphurs in this case. The BOND and IC (internal coordinate) records are the same as those for the RESI residues, except that the atom names are prefixed with the patched residue indices. 20. Is it possible to restrain secondary structure in the target sequence? 112 CHAPTER 4. FREQUENTLY ASKED QUESTIONS (FAQ) Yes. There are 'ALPHA', 'STRAND' and 'SHEET' restraint types that the MAKE_RESTRAINTS com- mand can generate. One specifies the segment which is then restrained to the specified secondary structure conformation. For example, # This is as usual: INCLUDE SET ALIGNMENT_FILE = 'align1.ali', KNOWNS='templ1', SEQUENCE='targ1' CALL ROUTINE = 'model' STOP # Redefine the special_restraints routine to include the secondary # structure restraints (this routine is empty by default): SUBROUTINE ROUTINE = 'special_restraints' SET ADD_RESTRAINTS = on # An alpha-helix: MAKE_RESTRAINTS RESTRAINT_TYPE = 'alpha', RESIDUE_IDS = '20' '30' # SET KEEP_DUPL_RESTR = 'new' # Two strands: MAKE_RESTRAINTS RESTRAINT_TYPE = 'STRAND', RESIDUE_IDS = '1' '6' MAKE_RESTRAINTS RESTRAINT_TYPE = 'STRAND', RESIDUE_IDS = '9' '14' # An anti-parallel sheet: MAKE_RESTRAINTS RESTRAINT_TYPE = 'SHEET', ATOM_IDS = 'N:1' 'O:14', SHEET_H-BONDS = -5 RETURN END_SUBROUTINE 21. I want to patch the N-terminal or (C-terminal) residue (e.g. , to model acetylation properly), but the PATCH command does not work. This is probably because the N-terminus is patched by default with the NTER patching residue (corresponding to -NH3+ ) and a patched residue may not be patched again. 
The solution is to turn the default patching off by SET PATCH_DEFAULT = off before the GENERATE_TOPOLOGY command is called. 22. Is it possible to use templates with the coordinates for C$\alpha$ atoms only? Yes. You do not have to do anything special. 23. How do I analyze the output log file? First, check for the error messages by searching for string `_E>'. These messages can only rarely be ignored. Next, check for the warning messages by searching for string `_W>'. These messages can almost always be ignored. If everything is OK so far, the most important part of the log file is the output of the ENERGY command for each model. This is where the violations of restraints are listed. When there are too many or too strongly violated restraints, more optimization or a different alignment is needed. What is too many and too much? It depends on the restraint type and is best learned by doing ENERGY on an X-ray structure or a good model to get a feel for it. You may also want to look at the output of command CHECK_ALIGNMENT, which should be self-explanatory. I usually ignore the other parts of the log file. 24. How do I prevent "knots" in the final models? The best way to prevent knots is to start with a starting structure that is as close to the desired final model as possible. Other than that, the only solution at this point is to calculate independently many models and hope that in some runs there won't be knots. Knots usually occur when one or more neighboring long insertions (i.e., longer than 15 residues) are modeled from scratch. The reason is that an insertion is built from a randomized distorted structure that is located approximately between the two core anchoring regions. Under such conditions, it is easy for the optimizer to `fall' into a knot and then not be able to recover from it. Sometimes knots result from an incorrect alignment, especially when more than one template is used. When the alignment is correct, knots are a result of optimization not being good enough. 
However, making optimization more thorough by increasing the CPU time would not be worth it on the average as knots occur relatively infrequently. The excluded volume restraints are already included in the standard comparative modeling routine. Chapter 5 Top , Modeller scripting language Top is an interpreter of a scripting language specialized for certain areas. Its use includes programs Modeller and Asgl . Its syntax resembles that of Fortran . 5.1 The source file Each Top program or include file is stored in a file named 'program.top'. The .top extension is mandatory. The Top program consists of a series of commands. The order of commands is important. An example of the Top program that writes integers 1 to 10 to the output file is: # Define a variable: DEFINE_INTEGER VARIABLES = IVAR # Open a file for appending OPEN IO_UNIT = 21, OBJECTS_FILE = 'output.file', FILE_ACCESS = 'APPEND' # Loop from 1 to 10: DO IVAR = 1, 10, 1 # Append IVAR to the output file: WRITE IO_UNIT = 21, OBJECTS = IVAR END_DO # Close a file CLOSE IO_UNIT = 21 # Exit: STOP There can be at most one command per line. Each command or line can be at most LENACT (512) characters long. The command can extend over several lines if a continuation character `;' is used to indicate the end of the current line. Everything on that line after the continuation character is ignored. A comment character `#' can be used anywhere on the line to ignore everything that occurs after the comment character. Blank lines are allowed. They are ignored. TAB characters are replaced by blank characters. Top converts all commands to upper case, except for the string constants that are quoted in single quotes '. Thus, Top is case insensitive, except for the quoted strings. There are two groups of commands: flow control commands and commands that perform certain tasks. The next two sections describe the flow control commands and those `performing' commands that are an integral part 113 114 CHAPTER 5. 
TOP , MODELLER SCRIPTING LANGUAGE of Top . There are also additional commands specific to each application of Top , such as Modeller and Asgl , which are described elsewhere. The usual Unix conventions are used for typesetting the rules. Table 5.1 explains the shorthand used to describe different variables and constants: an integer variable or constant a real variable or constant a string variable or constant a logical variable or constant prefix for a variable prefix for a constant _ _ _ _ _ _ _ _ a vector of any length with elements a vector of N elements ___Table_5.1:_List_of_variable_types_in_Top___.________ All the variables are formally vectors. When a variable is referred to in a scalar context its first element is used. All elements of one vector are of the same type. All variables, including a vector of the variable length, must have at least one element. There are four different variable types: integer, real, string and logical. The real constant is (Fortran real number representation): [+|-][digits][.][digits][{e|E|d|D}{+|-}digits] The integer constant is (Fortran integer number representation): [+|-][digits] The logical constant can be either on or off (case insensitive). The string constant can contain any character except for a prime '. It can be optionally enclosed in primes. If it is not quoted it is converted to upper case and its extent is determined by the position of the blanks on each side of the contiguous string of non-blank characters. 5.2 Top Commands There are `flow control' and `performing' commands. In general, the `performing' commands have the following syntax: ACTION [ASSIGNMENT, ASSIGNMENT, . . . , ASSIGNMENT] ACTION specifies what action to take. ASSIGNMENT sets the variable to the specified value. The values assigned in this way are kept until the next assignment. 
For example, CALL ROUTINE = 'routine_name', IVAR = 3 sets the integer variable IVAR to 3 and then calls routine routine_name; if IVAR is not changed in the routine, its value will remain to be 3 after the call to the routine. There can be any number of assignments in a command. They must be separated by commas. The assignment is of the form: = [-] The `=' character is optional (can be replaced with a blank). 5.2. TOP COMMANDS 115 and can be assigned to each other. When a real number is assigned to an integer variable, the decimal places are ignored. That is, the result is the same as if the Fortran function IFIX() was used. There must be no space between the optional - and . If a vector variable is assigned to a variable, all its elements are used. Real, integer, and logical variables can also be assigned to a string variable. The conversion of a real variable to a string value is guided by the Top variable NUMBER_PLACES which is of type . The first element of NUMBER_PLACES sets the number of places before the decimal point, and the second element the number of places after the decimal point. If the latter is -1, an integer number without a decimal point is obtained, if 0 there is a decimal point without any decimal places. Assignments can follow any command, except DO, END_DO, GO_TO, LABEL, STOP, and END_- SUBROUTINE. 5.2.1 DEFINE__INTEGER _ define integer variables Options: VARIABLES = '' variable names Description: This command defines user integer variables. All variables used in the Top program must be defined. An exception are the pre-defined Top variables listed at the end of this section. 5.2.2 DEFINE__LOGICAL _ define logical variables Options: VARIABLES = '' variable names Description: This command defines user logical variables. 5.2.3 DEFINE__REAL _ define real variables Options: VARIABLES = '' variable names Description: This command defines user real variables. 
5.2.4 DEFINE__STRING _ define string variables Options: VARIABLES = '' variable names Description: This command defines user string variables. 5.2.5 SET _ set variable Command: SET [ASSIGNMENT, [ASSIGNMENT, . . . [ASSIGNMENT]]] Description: This command sets the values of variables of any of the four types. See the description of ASSIGNMENT above. 116 CHAPTER 5. TOP , MODELLER SCRIPTING LANGUAGE There can be Unix shell environment variables in any input or output filename. The environment variables have to be in the format ${VARNAME} or $(VARNAME). Also, four predefined macros are available for string variables: o '${LIB}' is expanded into $LIB_APPLICATION shell environment variable, where APPLICATION is the name-version of the program (e.g., MODELLER4); o '${DIR}' is expanded into the Top variable DIRECTORY, o '${JOB}' is expanded into the root of the Top script filename, o '${DEFAULT}' is expanded into (ROOT_NAME)(FILE_ID)(ID1)(ID2)(FILE_EXT), where ROOT_NAME, FILE_ID, ID1, ID2, and FILE_EXT are Top variables. FILE_ID is a string that may be set to 'default'. In that case, a hard-wired short string is used instead of FILE_ID. Otherwise, the explicitly specified FILE_ID is applied instead. In any case, FILE_ID is not modified by the filename generation routine so that it can be used more than once without resetting it to the 'default' value. Four digits are used for both ID1 and ID2. For example, '2ptn.B99990001' results from ROOT_NAME = '2ptn', FILE_EXT = '.B', ID1 = 9999, and ID2 = 1. 5.2.6 OPERATE _ perform mathematical operation Options: OPERATION = 'SUM' operation to perform: 'SUM' | 'MULTIPLY' | 'DIVIDE' | 'POWER' | 'MOD' RESULT = '' variable name for the result of operation ARGUMENTS = 0.00 real arguments to the math operation Description: This command performs a specified mathematical operation. 
There can be up to MRPRM (120) operands for the 'SUM' and 'MULTIPLY' operations, but only two for 'DIVIDE' and 'POWER'. The RESULT value has to be the name of a real variable. 5.2.7 STRING__OPERATE _ perform string operation Options: OPERATION = 'SUM' operation to perform: CONCATENATE RESULT = '' variable name for the result of operation STRING_ARGUMENTS = '' arguments for string operation Description: This command performs a specified string operation. There can be up to MSPRM (130) operands for the CONCATENATE operation. The RESULT value has to be a name of the string variable. 5.2.8 RESET _ reset Top Description: This command resets the internal state of Top and its predefined variables to their initial values. It does this by calling the initialization routine that reads the 'top.ini' file. This command also undefines all user defined variables. 5.2.9 OPEN _ open input file Options: IO_UNIT = 21 IO unit for file operations OBJECTS_FILE = 'top.out' filename 5.2. TOP COMMANDS 117 FILE_ACCESS = 'SEQUENTIAL' file access: 'SEQUENTIAL' _ 'APPEND' FILE_STATUS = 'UNKNOWN' file status: 'UNKNOWN' _ 'OLD' _ 'NEW' Description: This command opens a specified file on the specified I/O stream for formatted access. Fortran conventions apply to FILE_ACCESS and FILE_STATUS. 5.2.10 WRITE _ write Top objects Options: IO_UNIT = 21 IO unit for file operations OBJECTS = '' variable names or constants NUMBER_PLACES = 5 2 pre- and post-decimal point places OUTPUT_DIRECTORY = '' output directory Description: This command writes the specified objects to a single line which is then written to a selected I/O stream. Each element of the OBJECTS vector is first tested if it is a name of a variable of any type. If it is the contents of that variable is written out. If it is not, the element is treated as a string constant. The first and second element of NUMBER_PLACES set the numbers of places before and after the decimal point, respectively, for real and integer objects. 
5.2.11 READ _ read record from input file Options: IO_UNIT = 21 IO unit for file operations RECORD = 'undefined' contents of the input line Description: This command reads a line from the file on the I/O channel IO_UNIT. The line goes into the string variable RECORD. 5.2.12 CLOSE _ close an input file Options: IO_UNIT = 21 IO unit for file operations Description: This command closes a specified I/O stream. 5.2.13 WRITE__TOP _ write the Top program Options: FILE = 'default' partial or complete filename OUTPUT_DIRECTORY = '' output directory FILE_ACCESS = 'SEQUENTIAL' file access: 'SEQUENTIAL' _ 'APPEND' Description: This command writes the current Top program in memory to a specified file. 118 CHAPTER 5. TOP , MODELLER SCRIPTING LANGUAGE 5.2.14 SYSTEM _ execute system command Options: COMMAND = 'nothing' UNIX command Description: This command executes the specified UNIX command. 5.2.15 INQUIRE _ check if file exists Options: FILE = 'default' partial or complete filename Description: This command assigns 1 to FILE_EXISTS if the specified file exists, otherwise it assigns 0. You can use it with a subsequent IF command for the flow control. 5.2.16 GO__TO _ jump to label Command: GO_TO Description: The `go_to' statement, which transfers execution to the Top statement occurring after the LABEL statement with the same name. 5.2.17 LABEL _ place jump label Command: LABEL Description: This command labels a target position for the GO_TO statement with the same name. 5.2.18 INCLUDE _ include Top file Options: INCLUDE_FILE = '__mod' include file name Description: This command includes a Top file INCLUDE_FILE. You do not have to specify the .top extension. First, the given filename is tried. Second, the directory specified in the $BIN_APPLICATION environment variable is prefixed and the open function is tried again. INCLUDE command is useful for including standard subroutines. 
5.2.19 CALL _ call Top subroutine Options: ROUTINE = '' subroutine name Description: This command calls a Top subroutine ROUTINE. 5.2. TOP COMMANDS 119 5.2.20 SUBROUTINE _ define Top subroutine Options: ROUTINE = '' subroutine name Description: This command is the first Top statement for any routine. It has to have a matching END_- SUBROUTINE. No nesting of subroutine definitions is allowed, although the definitions can be located anywhere in a file. 5.2.21 RETURN _ return from Top subroutine Description: This command will exit the execution from the current routine. It is optional. 5.2.22 END__SUBROUTINE _ end definition of Top subroutine Description: This command has to be present at the end of each routine. Possibly used instead of RETURN if RETURN not present. 5.2.23 DO _ DO loop Command: DO VAR = START, END, STEP commands END_DO Description: Commas after START and END can be omitted. This loop is exactly like a Fortran DO loop except that real values are allowed for any of the four controlling variables. VAR must be a variable, while START, END and STEP can also be constants. 5.2.24 IF _ conditional statement for numbers Options: OPERATION = 'SUM' EQ _ GT _ LT _ GE _ LE _ NE Description: This command performs conditional IF operation on two real arguments. The possible operations are equal (EQ), greater than (GT), less than (LT), greater or equal (GE), less or equal (LE), and not equal (NE). If the condition is true, the command specified in the THEN variable is executed. Otherwise the command in the ELSE variable is executed. Typically, these commands are GO_TO statements. 5.2.25 STRING__IF _ conditional statement for strings Options: OPERATION = 'SUM' EQ _ NE _ INDEX STRING_ARGUMENTS = '' arguments for string operation THEN = 'undefined' statement when IF evaluates to T ELSE = 'undefined' statement when IF evaluates to F Description: This command performs conditional IF operation on two string arguments. 
The possible operations are equal (EQ), not equal (NE), and the Fortran index() function (INDEX), which returns true if there 120 CHAPTER 5. TOP , MODELLER SCRIPTING LANGUAGE is `argument2' substring within `argument1'. If the condition is true, the command specified in the THEN variable is executed. Otherwise the command in the ELSE variable is executed. Typically, these commands are GO_TO statements. 5.2.26 STOP _ exit Top Description: This command stops the execution of the Top program. 5.3 Predefined Top variables __________________________________________________ __Name___________________________Type_____________ ARGUMENTS IO_UNIT ID1 ID2 NUMBER_PLACES FILE_EXISTS OUTPUT_CONTROL STOP_ON_ERROR OBJECTS VARIABLES ROUTINE ROOT_NAME DIRECTORY FILE_ID OPERATION RESULT STRING_ARGUMENTS OBJECTS_FILE INCLUDE_FILE FILE RECORD THEN ELSE COMMAND FILE_EXT OUTPUT_DIRECTORY FILE_ACCESS __FILE_STATUS______________________ ______Table_5.2:_Predefined_Top____variables___________ Chapter 6 Methods 6.1 Dynamic programming for sequence and structure comparison and searching In this section, the basic dynamic programming method for sequence alignment is described [Sali & Blundell, 1990]. This method forms the core of the pairwise and multiple sequence and structure comparisons as well as of the sequence database searching. 6.1.1 Pairwise comparison The residue by residue scores Wij can be used directly in the sequence alignment algorithm of Needleman & Wunsch [Needleman & Wunsch, 1970] to obtain the comparison of two protein sequences or structures. The only difference between the two types of comparison is in the type of the comparison matrix. In the case of sequence, the amino acid substitution matrix is used. In the case of 3D structure, the Euclidean distance (or some function of it) between two equivalent atoms in the current optimal superposition is used [Sali & Blundell, 1990]. 
The problem of the optimal alignment of two sequences as addressed by the algorithm of Needleman & Wunsch is as follows. We are given two sequences of elements and an M times N score matrix W where M and N are the numbers of elements in the first and second sequence. The scoring matrix is composed of scores Wij describing differences between elements i and j from the first and second sequence respectively. The goal is to obtain an optimal set of equivalences that match elements of the first sequence to the elements of the second sequence. The equivalence assignments are subject to the following "progression rule": for elements i and k from the first sequence and elements j and l from the second sequence, if element i is equivalenced to element j, if element k is equivalenced to element l and if k is greater than i, l must also be greater than j. The optimal set of equivalences is the one with the smallest alignment score. The alignment score is a sum of scores corresponding to matched elements, also increased for occurrences of non-equivalenced elements (i.e., gaps). For a detailed discussion of this and related problems see [Sankoff & Kruskal, 1983]. We summarize the dynamic programming formulae used by Modeller to obtain the optimal alignment since they differ slightly from those already published [Sellers, 1974, Gotoh, 1982]. The recursive dynamic programming formulae that give a matrix D are: 121 122 CHAPTER 6. METHODS $D_{i,j} = \min \{ P_{i,j},\ D_{i-1,j-1} + W_{i,j},\ Q_{i,j} \}$, $P_{i,j} = \min \{ D_{i-1,j} + g(1),\ P_{i-1,j} + v \}$, $Q_{i,j} = \min \{ D_{i,j-1} + g(1),\ Q_{i,j-1} + v \}$ (6.1) where g(l) is a linear gap penalty function: $g(l) = u + v \cdot l$. (6.2) Note that only a vector is needed for the storage of P and Q. The uppermost formula in Eq. 6.1 is calculated for i = M and j = N . Variable l is a gap length and parameters u and v are gap-penalty constants. The arrays D, P and Q are initialized as follows: ae Di;0 = 0;g(i - e); e ff (6* *.55) where f is an upper bound and pgauss is given in Eq. 6.38. 
A similar equation relying on the first derivatives of a Gaussian p holds for the first derivatives of an upper bound. Cosine restraint This is usually used for dihedral angles $f$: $c = |b| - b \cos(nf + a)$ (6.56) where $b$ is Charmm force constant, $a$ is phase shift (tested for 0 and $180^\circ$), and $n$ is periodicity (tested for 1, 2, 3, 4, 5, and 6). The Charmm phase value from the Charmm parameter library corresponds to $a - 180^\circ$. The force constant $b$ can be negative, in effect offsetting the phase $a$ for $180^\circ$ compared to the same but positive force constant. $\frac{dc}{df} = bn \sin(nf + a)$ (6.57) Coulomb restraint $c = \frac{q_i q_j}{f} s(f; f_1, f_2)$ (6.58) $s(f; f_1, f_2) = 1$ for $f \le f_1$; $s(f; f_1, f_2) = \frac{(f_2 - f)^2 (f_2 + 2f - 3f_1)}{(f_2 - f_1)^3}$ for $f_1 < f \le f_2$; $s(f; f_1, f_2) = 0$ for $f > f_2$ (6.59) where $q_i$ and $q_j$ are the atomic charges of atoms i and j, obtained from the Charmm topology file, that are at a distance $f$. Function $s(f; f_1, f_2)$ is a switching function that smoothes the potential down to zero in the interval from $f_1$ to $f_2$ ($f_2 > f_1$). The total Coulomb energy of a molecule is a sum over all pairs of atoms that are not in the same bonds or bond angles. 1-4 energy for the 1-4 atom pairs in the same dihedral angle corresponds to the ELEC14 Modeller term; the remaining longer-range contribution corresponds to the ELEC term. The first derivatives are: $\frac{dc}{df} = -\frac{c}{f_{ij}} s + c \frac{ds}{df}$ (6.60) $\frac{ds}{df} = 0$ for $f \le f_1$; $\frac{ds}{df} = \frac{6 (f_2 - f)(f_1 - f)}{(f_2 - f_1)^3}$ for $f_1 < f \le f_2$; $\frac{ds}{df} = 0$ for $f > f_2$ (6.61) Lennard-Jones restraint Usually used for non-bonded distances: $c = \left[ \left(\frac{A}{f}\right)^{12} - \left(\frac{B}{f}\right)^{6} \right] s(f; f_1, f_2)$ (6.62) The parameters $f_1$ and $f_2$ of the switching function can be different from those in Eq. 6.59. The parameters A and B are obtained from the Charmm parameter file (NONBOND section) where they are given as $E_i$ and $r_i$ such that $E_{ij}(f) = -4 \sqrt{E_i E_j} \left[ (\rho_{ij}/f)^{12} - (\rho_{ij}/f)^{6} \right]$ in kcal/mole for $f$ in angstroms and $\rho = (r_i + r_j)/2^{1/6}$; the minimum 130 CHAPTER 6. METHODS of $E$ is $-\sqrt{E_i E_j}$ at $f = (r_i + r_j)$, and its zero is at $f = \rho$. 
The total Lennard-Jones energy should be evaluated over all pairs of atoms that are not in the same bonds or bond angles. The parameters A and B for 1-4 pairs in dihedral angles can be different from those for the other pairs; they are obtained from the second set of $E_i$ and $r_i$ in the Charmm parameter file, if it exists. 1-4 energy corresponds to the LJ14 Modeller term; the remaining longer-range contribution corresponds to the LJ term. The first derivatives are: $\frac{dc}{df} = \frac{C s}{f} - C \frac{ds}{df}$ (6.63) $C = -12 \left(\frac{A}{f}\right)^{12} + 6 \left(\frac{B}{f}\right)^{6}$ (6.64) Spline restraint Any restraint form can be represented by a cubic spline [Press et al., 1992]: $c = A c_j + B c_{j+1} + C c''_j + D c''_{j+1}$ (6.65) $A = \frac{f_{j+1} - f}{f_{j+1} - f_j}$ (6.66) $B = 1 - A$ (6.67) $C = \frac{1}{6}(A^3 - A)(f_{j+1} - f_j)^2$ (6.68) $D = \frac{1}{6}(B^3 - B)(f_{j+1} - f_j)^2$ (6.69) where $f_j \le f \le f_{j+1}$. The first derivatives are: $\frac{dc}{df} = \frac{c_{j+1} - c_j}{f_{j+1} - f_j} - \frac{3A^2 - 1}{6}(f_{j+1} - f_j) c''_j + \frac{3B^2 - 1}{6}(f_{j+1} - f_j) c''_{j+1}$ (6.70) The values of $c$ and $c'$ beyond $f_1$ and $f_n$ are obtained by linear interpolation from the termini. A violation of the restraint is calculated by finding the global minimum. A relative violation is estimated by using a standard deviation (e.g., force constant) obtained by fitting a parabola to the global minimum. Variable spacing of spline points could be used to save on memory. However, this would increase the execution time, so it is not used. Symmetry restraint The asymmetry penalty added to the objective function is defined as $F_{symm} = \sum_{i<j} \omega_i \omega_j (d_{ij} - d'_{ij})^2$ (6.71) distances. 6.4. LIST OF COMMANDS, ARGUMENTS, AND DEFAULT VALUES 131 6.4 List of commands, arguments, and default values The top.ini file contains the list of all Modeller commands, arguments, and default values of arguments. 
--- COMMANDS: 1 no_action 2 SET 3 STOP 4 LABEL 5 GO_TO 6 DEFINE_INTEGER 7 DEFINE_REAL 8 END_DO 9 DO 10 CALL 11 RESET 12 WRITE 13 OPERATE 14 STRING_OPERATE 15 DEFINE_STRING 16 DEFINE_LOGICAL 17 SUBROUTINE 18 END_SUBROUTINE 19 INCLUDE 20 RETURN 21 READ 22 OPEN 23 CLOSE 24 IF 25 WRITE_TOP 26 SYSTEM 27 INQUIRE 28 STRING_IF 31 READ_RESTRAINTS 32 READ_SCHEDULE 33 WRITE_RESTRAINTS 34 READ_MODEL 35 SUPERPOSE 36 COMPARE 37 WRITE_MODEL 38 WRITE_MODEL2 39 OPTIMIZE 40 ENERGY 41 READ_MODEL2 42 PICK_ATOMS 43 ROTATE_DIHEDRALS 44 READ_ALIGNMENT 45 DELETE_ALIGNMENT 46 SWITCH_TRACE 47 PATCH 48 TRANSFER_RES_NUMB 49 MAKE_SCHEDULE 50 WRITE_SCHEDULE 51 ID_TABLE 52 MAKE_WRITE_UHBD_LIB 53 BUILD_MODEL 54 GENERATE_TOPOLOGY 55 MAKE_RESTRAINTS 56 READ_TOPOLOGY 57 READ_PARAMETERS 58 WRITE_TOPOLOGY_MODEL 132 CHAPTER 6. METHODS 59 MAKE_TOPOLOGY_MODEL 60 ROTATE_MODEL 61 WRITE_ALIGNMENT 62 REORDER_ATOMS 63 PICK_RESTRAINTS 64 CONDENSE_RESTRAINTS 65 DELETE_RESTRAINT 66 ADD_RESTRAINT 67 TRANSFER_XYZ 68 RANDOMIZE_XYZ 69 DEBUG_FUNCTION 70 REORDER2_ATOMS 71 PICK_HOT_ATOMS 72 REINDEX_RESTRAINTS 73 ALIGN 74 SEQUENCE_SEARCH 75 ALIGN3D 76 ORIENT_MODEL 77 DESCRIBE 78 SEQUENCE_COMPARISON 79 MALIGN3D 80 MALIGN 81 SEQUENCE_TO_ALI 82 PMF 83 MUTATE_MODEL 84 PATCH_DISULFIDES 85 WRITE_DATA 86 PRINCIPAL_COMPONENTS 87 READ_ALIGNMENT2 88 COMPARE_ALIGNMENTS 89 ALIGN_CONSENSUS 90 QUICK_AND_DIRTY 91 SPLINE_RESTRAINTS 92 RENAME_SEGMENTS 93 DEFINE_SYMMETRY 94 REPORT_MODELLER 95 CHECK_ALIGNMENT 96 ALIGN2D 97 COLOR_ALN_MODEL 98 IUPAC_MODEL 99 DENDROGRAM 100 EXPAND_ALIGNMENT 101 UNBUILD_MODEL 102 READ_ATOM_CLASSES 103 ENERGY_PROFILE --- KEYWORDS: 1 REAL ARGUMENTS 0 0.00 # real arguments to the math operation 31 REAL UPDATE_DYNAMIC 1 0.39 # when to update non-bonded pairs list 32 REAL MATRIX_OFFSET 1 0.00 # substitution matrix offset for local alignment 33 REAL SPHERE_STDV 1 0.05 # standard deviation of soft-sphere repulsion 34 REAL VIOL_REPORT_CUT 31 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 999 999 999 999 4.* *5 
4.5 4.5 4.5 4.5 4.@ 35 REAL DEBUG_FUNCTION_CUTOFF 3 2.0 0.05 0.1 # cutoffs for reporting differences between numerical and analyti* *cal derivatives: absolu@ 36 REAL TRANSLATION 3 0.0 0.0 0.0 # translation vector for MODEL 37 REAL SA_STEP 1 0.2 # amplitude of the Monte Carlo steps 38 REAL SA_MVFRACT 1 0.8 # fraction of accepted Monte Carlo steps 39 REAL SA_TFACTR 1 0.9 # factor for temperature deacrease in MC SA 40 REAL SA_T0 1 40.0 # starting SA temperature 41 REAL SA_TMIN 1 0.01 # final SA temperature 42 REAL MIN_ATOM_SHIFT 1 0.010 # minimal atomic shift for the optimization convergence test 43 REAL DEVIATION 1 0.0 # coordinate randomizaton amplitude in angstroms 44 REAL RMS_CUTOFFS 11 3.5 3.5 60 60 15 60 60 60 60 60 60 # cutoffs for RMS, DRMS, Alpha Phi* * Psi Omega chi1 chi2 @ 6.4. LIST OF COMMANDS, ARGUMENTS, AND DEFAULT VALUES 133 45 REAL TEMPERATURE 1 293.0 # temperature for MD simulation in K 46 REAL MD_TIME_STEP 1 4.0 # time step for MD in fs 47 REAL RADII_FACTOR 1 0.82 # factor for van der Waals radii 48 REAL LENNARD_JONES_SWITCH 2 6.5 7.5 # the range for Lennard-Jones interaction smoothing to 0 49 REAL COULOMB_SWITCH 2 6.5 7.5 # the range for Coulomb interaction smoothing to 0 50 REAL ROTATION_MATRIX 9 1 0 0 0 1 0 0 0 1 # rotation matrix for MODEL 51 REAL BASIS_RELATIVE_WEIGHT 1 0.05 # the cutoff weight of basis pdf's for their removal 52 REAL SYMMETRY_WEIGHT 1 1.0 # the weight of the symmetry objective function term 53 REAL MAXIMAL_DISTANCE 1 999. 
# maximal distance for distance restraints 54 REAL RESTRAINTS_FILTER 31 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999* * -999 -999 -999 -999 @ 55 REAL RESTRAINT_PARAMETERS 0 3 1 3 3 4 2 0 0.0 0.087 # restraint parameters 56 REAL SPHERE_RADIUS 1 10.0 # sphere radius for atoms selection 57 REAL SELECTION_SLAB 5 -9999 9999 0 0 0 # slab for atoms selection: "Z-dz1" "Z-dz2" "Z-xtrans" "Z* *-ytrans" "Z-ztrans" 58 REAL PICK_HOT_CUTOFF 1 4.0 # radius for picking hot atoms 59 REAL CAP_ATOM_SHIFT 1 0.2 # limit for atomic shifts in optimization 60 REAL MOLPDF 1 0.0 # value of objective function 61 REAL GAP_PENALTIES_3D 2 0.0 1.75 # gap creation and extension penalties for structure/structure super* *position 62 REAL CONTACT_SHELL 1 4.0 # distance cutoff for calculation of the non-bonded pairs list 63 REAL RESTRAINT_STDEV 2 0.0 1.0 # transforming factors for standard deviations (y=a+bx) in models 1-* *-6 or standard deviati@ 64 REAL PMF_GRID 8 2.0 0.5 20 36 18 0 180 1 # translation and rotation grid for PMF c* *alculation 65 REAL RELATIVE_DIELECTRIC 1 1.0 # relative dielectric constant 66 REAL ROTATION_ANGLE 1 0.0 # rotation of MODEL around axis [degrees] 67 REAL ROTATION_AXIS 3 1.0 0.0 0.0 # rotation axis for MODEL 68 REAL SPLINE_DX 1 0.5 # interval size for splining restraints 69 REAL SPLINE_RANGE 1 4.0 # range of the splines 70 REAL GAP_PENALTIES_2D 6 0.5 0.5 0.5 0.5 1.0 6.0 # gap penalties for sequence/structure alignment: h* *elix, beta, accessibili@ 71 REAL SCHEDULE_SCALE 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * *1 1 # factors for ph@ 72 REAL CLUSTER_CUT 1 -1.0 # definition of a cluster 73 REAL GAP_PENALTIES_1D 2 -400 -150 # gap creation and extension penalties for sequence/sequence alignm* *ent 74 REAL FAST_SEARCH_CUTOFF 1 1.0 # if FAST_SEARCH is ON only sequences with database scan significance hi* *gher than this value @ 75 REAL VIOL_REPORT_CUT2 31 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 
2.0* * 2.0 2.0 2.0 2.0 2.0@ 1 INTEGER IO_UNIT 1 21 # IO unit for file operations 2 INTEGER ID1 1 1 # ID1 for filename construction 3 INTEGER ID2 1 1 # ID2 for filename construction 4 INTEGER NUMBER_PLACES 2 5 2 # pre- and post-decimal point places 5 INTEGER FILE_EXISTS 1 0 # an output flag: 0 _ 1 6 INTEGER OUTPUT_CONTROL 4 1 0 0 1 # selects output, flow-control msgs, warnings, errors 7 INTEGER STOP_ON_ERROR 1 1 # whether to stop on error 31 INTEGER SCHEDULE_STEP 1 1 # schedule step for optimization 32 INTEGER ROUTINE_TYPE 1 1 # generic routine type for a miscellaneous command 33 INTEGER NLOGN_USE 1 15 # number of residues at which to begin using the N Log N non-bonded * * pairs routine 34 INTEGER SA_MOVSPERATM 1 30 # 35 INTEGER RESIDUE_GROUPING 1 1 # 36 INTEGER MAX_ITERATIONS 1 200 # maximal iterations in optimization 37 INTEGER RAND_SEED 1 -8123 # random seed from -50000 to -2 38 INTEGER COMPARE_MODE 1 3 # selects the type of comparison: 1 _ 2 _ 3 39 INTEGER EXTEND_HOT_SPOT 1 0 # whether to extend hot spots 40 INTEGER TOPOLOGY_MODEL 1 3 # selects topology library: 1--9 41 INTEGER MODELLER_STATUS 1 0 # indicates the severity of the error 42 INTEGER N_SCHEDULE 1 1 # the number of steps in the optimization schedule 43 INTEGER DISTANCE_RSR_MODEL 1 1 # the model for calculating distance restraints: 1--7 44 INTEGER ACCESSIBILITY_TYPE 1 8 # type of solvent accessibility: 1--10 45 INTEGER RESIDUE_SPAN_RANGE 2 0 99999 # range of residues spanning the allowed distances; for MAKE_RESTRAIN* *TS, PICK_RESTRAINTS, no@ 46 INTEGER MAX_GAP_LENGTH 1 999999 # maximal length of gap in protein comparisons 47 INTEGER OPTIMIZATION_METHOD 1 -999 # type of optimization method: 1 _ 3 48 INTEGER SELECTION_STEP 1 1 # increase in residue index when selecting residues 49 INTEGER NUMB_OF_SEQUENCES 1 1 # number of sequences in the alignment 50 INTEGER TRACE_OUTPUT 1 0 # modulus for writing information about optimization iterations: 0 for no* *thing 51 INTEGER SEARCH_TOP_LIST 1 20 # the length of 
the output hits list 52 INTEGER EQUILIBRATE 1 999999 # equilibrate during MD every that many steps 53 INTEGER MAX_GAPS_MATCH 1 1 # 134 CHAPTER 6. METHODS 54 INTEGER ALIGN_BLOCK 1 1 # the last sequence in the first block of sequences 55 INTEGER PICK_ATOMS_SET 1 1 # index of the selected atoms set: 1 _ 2 _ 3 56 INTEGER PMF_INDICES 0 0 0 0 0 # 57 INTEGER SEARCH_RANDOMIZATIONS 1 0 # number of randomizations for calculating the significance of a sequence/se* *quence similarity 58 INTEGER OFF_DIAGONAL 1 100 # to speed up the alignment 59 INTEGER RESTRAINT_GROUP 1 26 # physical restraint group 60 INTEGER OVERHANG 1 0 # un-penalized overhangs in protein comparisons 61 INTEGER SPLINE_SELECT 3 4 1 9 # specification of the restraints to be splined: -"tt form feature * * group" 62 INTEGER LIBRARY_SCHEDULE 1 1 # selects schedule from the $SCHED_LIB library 63 INTEGER DYNAMIC_FLAG 1 0 # 64 INTEGER SPLINE_MIN_POINTS 1 5 # have at least as many intervals in a spline 65 INTEGER SHEET_H-BONDS 1 7 # specify hydrogen bonds in a beta-sheet 66 INTEGER SMOOTHING_WINDOW 1 1 # profiles are smoothed over 2*SW + 1 residues 1 STRING OBJECTS 0 '' # variable names or constants 2 STRING VARIABLES 0 '' # variable names 3 STRING ROUTINE 0 '' # subroutine name 4 STRING ROOT_NAME 1 'undf' # root of a filename for filename construction 5 STRING DIRECTORY 1 '' # directory list (e.g., "Z-dir1:dir2:dir3:./:/") 6 STRING FILE_ID 1 'default' # file id for filename construction 7 STRING OPERATION 1 'SUM' # operation to perform: "Z-SUM" _ "Z-MULTIPLY" _ "Z-DIVIDE" _ "Z-POW* *ER" _ "Z-MOD" 8 STRING RESULT 0 '' # variable name for the result of operation 9 STRING STRING_ARGUMENTS 0 '' # arguments for string operation 10 STRING OBJECTS_FILE 1 'top.out' # filename 11 STRING INCLUDE_FILE 1 '__mod' # include file name 12 STRING FILE 1 'default' # partial or complete filename 13 STRING RECORD 1 'undefined' # contents of the input line 14 STRING THEN 1 'undefined' # statement when IF evaluates to T 15 STRING ELSE 1 
'undefined' # statement when IF evaluates to F 16 STRING COMMAND 1 'nothing' # UNIX command 17 STRING FILE_EXT 1 '' # file extension for filename construction 18 STRING OUTPUT_DIRECTORY 1 '' # output directory 19 STRING FILE_ACCESS 1 'SEQUENTIAL' # file access: "Z-SEQUENTIAL" _ "Z-APPEND" 20 STRING FILE_STATUS 1 'UNKNOWN' # file status: "Z-UNKNOWN" _ "Z-OLD" _ "Z-NEW" 31 STRING BUILD_METHOD 1 '3D_INTERPOLATION' # method for building coordinates: "Z-INTERNAL_COORDINATES" * * _ "Z-3D_INTERPOLATION" 32 STRING DIHEDRALS 0 'PHI' 'PSI' 'CHI1' 'CHI2' 'CHI3' 'CHI4' # dihedral angle type selection: "* *Z-phi" _ "Z-psi" _ @ 33 STRING RES_TYPES 1 'ALL' # residue type selection 34 STRING ATOM_TYPES 1 'ALL' # atom type selection 35 STRING VARIABILITY_FILE 1 'undefined' # output filename 36 STRING ALIGN_CODES 0 'all' # codes of proteins in the alignment 37 STRING ATOM_FILES 0 '' # complete or partial atom filenames 38 STRING OUTPUT 1 'LONG' # what and/or how to output 39 STRING CHANGE 1 'RANDOMIZE' # what to do: "Z-RANDOMIZE" _ "Z-OPTIMIZE" 40 STRING FIT_ATOMS 1 'CA' # atom type(s) being superposed 41 STRING MODEL_FORMAT 0 'PDB' # selects input atom file format: "Z-PDB" _ "Z-CHARMM" _ "Z-UHBD" 42 STRING SEQUENCE 1 'undefined' # protein code in the alignment whose topology is constructed 43 STRING RESTRAINT_TYPE 0 'STEREO' # restraint type to be calculated: "Z-STEREO" _ "Z-BOND" _ "Z-ANGLE* *" _ "Z-IMPROPER" _ "Z@ 44 STRING ALIGNMENT_FORMAT 0 'PIR' # format of the alignment file: "Z-PIR" _ "Z-PAP" _ "Z-QUANTA" _ "Z-* *INSIGHT" 45 STRING FILE_FORMAT 1 'FORMATTED' # file format: "Z-FORMATTED" _ "Z-UNFORMATTED" 46 STRING ALIGNMENT_FEATURES 0 'INDICES CONSERVATION' # what alignment features to write out: "Z-ACCURACY" _ * * "Z-HELIX" _ "Z-BETA" @ 47 STRING RESIDUE_TYPE 1 'undefined' # 48 STRING MATRIX_FILE 1 'family.mat' # the filename of the pairwise distance matrix 49 STRING BASIS_PDF_WEIGHT 1 'LOCAL' # a method for calculation of basis pdf weights: "Z-LOCAL" _ "Z-GLO* *BAL" 50 
STRING DISTANCE_ATOMS 2 'CA' 'CA' # atom types for distance generation 51 STRING MDT_LIB_FILE 1 'mnch1.mdt' # file with probability distributions for restraints 52 STRING BIN_LIB_FILE 1 '$(LIB)/mdt.bin' # file with bin definitions for restraints 53 STRING ATOM_IDS 0 '' # atom ids: atom:residue_id[:chain_id] 54 STRING SPHERE_CENTER 2 'undefined' 'undefined' # '"#RES1:C' 'ATOM_NAME' 55 STRING SELECTION_MODE 1 'ATOM' # selecting what: "Z-ATOM" _ "Z-RESIDUE" 56 STRING SELECTION_SEARCH 1 'SEGMENT' # search method: "Z-SPHERE" _ "Z-SEGMENT" 57 STRING SELECTION_STATUS 1 'INITIALIZE' # what to do with selected atoms: "Z-ADD" _ "Z-REMOVE" _ "Z-IN* *ITIALIZE" 58 STRING SELECTION_SEGMENT 2 '@:@' 'X:X' # RES:CHN ids for the first and last residues in a chain/segment 6.4. LIST OF COMMANDS, ARGUMENTS, AND DEFAULT VALUES 135 59 STRING SELECTION_FROM 1 'ALL' # selecting from: "Z-ALL" _ "Z-SELECTED" 60 STRING KEEP_RESTRAINTS 1 'ONE_ATOM' # what static restraints to keep: "Z-ALL_ATOMS" "OR" "Z-ONE_ATOM" 61 STRING MD_RETURN 1 'FINAL' # return MODEL with "Z-MINIMAL" energy or "Z-FINAL" MODEL 62 STRING ATOM_CLASSES_FILE 1 '$(LIB)/atmcls-168.lib' # library with atom class definitions for MODELLER non-* *bonded restraints 63 STRING RR_FILE 1 '$(LIB)/as1.sim.mat' # input residue-residue scoring file 64 STRING SEARCH_CHAINS_FILE 1 '$(LIB)/CHAINS_all.seq' # file with a list of sequence codes 65 STRING MODEL_SEGMENT 2 '@:@' 'X:X' # segment to be read in 66 STRING MODEL2_SEGMENT 2 '@:@' 'X:X' # segment to be read in 67 STRING ATOM_FILES_DIRECTORY 1 './' # input atom files directory list (e.g., "Z-dir1:dir2:dir3:./:/") 68 STRING SEARCH_SORT 1 'LONGER' # which sequence to use for normalization when sorting the hit li* *st: "Z-SHORTER" _ "Z-L@ 69 STRING RESTRAINTS_FORMAT 1 'MODELLER' # format of the restraints file: "Z-MODELLER" _ "Z-USER" 70 STRING SEARCH_CHAINS_LIST 1 '$(LIB)/CHAINS_3.0_30_XN.cod' # file with sequences 71 STRING SEGMENT_IDS 0 '' # new segment ids 72 STRING RESIDUE_IDS 0 '' # 
residue id (number:chnid) 73 STRING ALIGN_WHAT 1 'BLOCK' # what to align in ALIGN; "Z-BLOCK" _ "Z-ALIGNMENT" _ "Z-LAST" 31 LOGICAL FIT 1 on # whether to do pairwise least-squares fitting or ALIGN2D alignment 32 LOGICAL SUPERPOSE_REFINE 1 off # whether to refine the superposition 35 LOGICAL DYNAMIC_SPHERE 1 on # whether to use dynamic soft-sphere repulsion terms 36 LOGICAL DYNAMIC_LENNARD 1 off # whether to use dynamic Lennard-Jones energy terms 37 LOGICAL DYNAMIC_COULOMB 1 off # whether to use dynamic Coulomb energy terms 38 LOGICAL WRITE_FIT 1 off # whether to write out fitted coordinates to .fit files 39 LOGICAL ASGL_OUTPUT 1 off # whether to write output for ASGL 40 LOGICAL ADD_RESTRAINTS 1 off # whether to add new restraints to existing restraints 41 LOGICAL ADD_SEGMENT 1 off # whether to add the new segments to the list of segments 42 LOGICAL REMOVE_GAPS 1 on # whether to remove all-gap positions in input alignment 44 LOGICAL LOCAL_ALIGNMENT 1 off # whether to do local as opposed to global alignment 45 LOGICAL WATER_IO 1 off # whether to read water coordinates 46 LOGICAL HETATM_IO 1 off # whether to read HETATM coordinates 47 LOGICAL HYDROGEN_IO 1 off # whether to read hydrogen coordinates 48 LOGICAL INITIALIZE_XYZ 1 on # whether to use IC entries to calculate all coordinates 49 LOGICAL ADD_SEQUENCE 1 off # whether to add the new sequences to the existing alignment 50 LOGICAL ALIGN3D_TRF 1 off # whether to transform the distances before dynamic programming 51 LOGICAL PATCH_DEFAULT 1 on # whether to do default NTER and CTER patching 52 LOGICAL INTERSEGMENT 1 on # whether to restrain inter-segment non-bonded pairs 53 LOGICAL ALIGN3D_REPEAT 1 off # do several starts to maximize number of equivalent positions 54 LOGICAL Lundefined 0 55 LOGICAL INIT_VELOCITIES 1 on # whether to initialize velocities before MD 56 LOGICAL ADD_SYMMETRY 2 off on # whether to add segment pair, add atoms to segment pair 57 LOGICAL SPLINE_ON_SITE 1 off # whether to convert restraints to 
splines 58 LOGICAL ADD_PARAMETERS 1 off # whether to add new parameters to existing ones 59 LOGICAL ADD_TOPOLOGY 1 off # whether to add new residue topologies to existing ones 60 LOGICAL WRITE_WHOLE_PDB 1 on # whether to write out all lines in the input PDB file 61 LOGICAL WRITE_ALL_ATOMS 1 on # whether to write all atoms, even if unselected 62 LOGICAL CURRENT_DIRECTORY 1 on # whether to write output .fit files to current directory 63 LOGICAL DETAILED_DEBUGGING 1 off # whether to evaluate energy and derivatives wrt each restraint 64 LOGICAL RENUMBER_RESIDUES 1 on # whether to renumber residues from 1 to N 65 LOGICAL DYNAMIC_MODELLER 1 off # whether to use dynamic MODELLER non-bonded restraints 66 LOGICAL FAST_SEARCH 1 off # whether to use fast sequence search or not 67 LOGICAL DATA_FILE 1 off # whether results go to a separate file or not 68 LOGICAL NORMALIZE_PROF 1 off # whether to normalize energy profiles or not 69 LOGICAL CUT_OVERHANGS 1 off # whether to cut overhangs at OVERHANG residues or not 70 LOGICAL RESIDUE_SPAN_SIGN 1 on # whether to do N*(N-1)/2 loop for atom pairs in MAKE_RESTRAINTS RESTRAINT_TYPE = 'distance' --- END OF FILE The third column contains the number of values for each of the options if this number is fixed; otherwise it contains 0. You can change any command or variable name in this file without changing the source code that relies on it, but you cannot change the order of the lines. 136 CHAPTER 6. METHODS Bibliography (1970). IUPAC-IUB commission on biochemical nomenclature. Abbreviations and symbols for the description of the conformation of polypeptide chains. Tentative rules (1969). Biochemistry, 9, 3471-3479. Abola, E. E., Bernstein, F. C., Bryant, S. H., Koetzle, T., & Weng, J. (1987). Protein data bank. In: Crystallographic databases — Information, content, software systems, scientific applications, (Allen, F. H., Bergerhoff, G., & Sievers, R., eds) pp. 107-132. 
Data Commission of the International Union of Crystallography Bonn/Cambridge/Chester. Braun, W. & Gō, N. (1985). Calculation of protein conformations by proton-proton distance constraints: A new efficient algorithm. J. Mol. Biol. 186, 611-626. Brooks, B. R., Bruccoleri, R. E., Olafson, B. D., States, D. J., Swaminathan, S., & Karplus, M. (1983). CHARMM: A program for macromolecular energy minimization and dynamics calculations. J. Comp. Chem. 4, 187-217. Chothia, C. & Lesk, A. M. (1987). Canonical structures for the hypervariable regions of immunoglobulins. J. Mol. Biol. 196, 901-917. Dunbrack Jr., R. L., Gerloff, D. L., Bower, M., Chen, X., Lichtarge, O., & Cohen, F. E. (1997). Meeting review: the second meeting on the critical assessment of techniques for protein structure prediction (CASP2), Asilomar, California, December 13-16, 1996. Folding & Design, 2, R27-R42. Felsenstein, J. (1985). Confidence limits on phylogenies: An approach using the bootstrap. Evolution, 39, 783-791. Flockner, H., Braxenthaler, M., Lackner, P., Jaritz, M., Ortner, M., & Sippl, M. J. (1995). Progress in fold recognition. Proteins, 23, 376-386. Godzik, A., Kolinski, A., & Skolnick, J. (1992). Topology fingerprint approach to the inverse protein folding problem. J. Mol. Biol. 227, 227-238. Gotoh, O. (1982). An improved algorithm for matching biological sequences. J. Mol. Biol. 162, 705-708. Hubbard, T. J. P. & Blundell, T. L. (1987). Comparison of solvent inaccessible cores of homologous proteins: Definitions useful for protein modelling. Protein Eng. 1, 159-171. Jones, D. T., Taylor, W. R., & Thornton, J. M. (1992). A new approach to protein fold recognition. Nature, 358, 86-89. Kabsch, W. & Sander, C. (1983). Dictionary of protein secondary structure: Pattern recognition of hydrogen-bonded and geometrical features. Biopolymers, 22, 2577-2637. Kendrew, J. C., Klyne, W., Lifson, S., Miyazawa, T., Némethy, G., Phillips, D. C., Ramachandran, G. N., & Scheraga, H. (1970). 
IUPAC-IUB commission on biochemical nomenclature: Abbreviations and symbols for the description of the conformation of polypeptide chains. J. Mol. Biol. 52, 1-17. Laskowski, R. A., McArthur, M. W., Moss, D. S., & Thornton, J. M. (1993). PROCHECK: A program to check the stereochemical quality of protein structures. J. Appl. Cryst. 26, 283-291. Lüthy, R., Bowie, J. U., & Eisenberg, D. (1992). Assessment of protein models with three-dimensional profiles. Nature, 356, 83-85. 137 138 BIBLIOGRAPHY Mosimann, S., Meleshko, R., & James, M. N. G. (1995). A critical assessment of comparative molecular modeling of tertiary structures of proteins. Proteins, 23, 301-317. Needleman, S. B. & Wunsch, C. D. (1970). A general method applicable to the search for similarities in the amino acid sequence of two proteins. J. Mol. Biol. 48, 443-453. Nicholls, A., Sharp, K. A., & Honig, B. (1991). Protein folding and association: insights from the interfacial and thermodynamic properties of hydrocarbons. Proteins, 11, 281-296. Press, W. H., Teukolsky, S. A., Vetterling, W. T., & Flannery, B. P. (1992). Numerical Recipes, 2nd edition. Cambridge: Cambridge University Press. Richards, F. M. & Kundrot, C. E. (1988). Identification of structural motifs from protein coordinate data: Secondary structure and first level super-secondary structure. Proteins, 3, 71-84. Richmond, T. J. & Richards, F. M. (1978). Packing of α-helices: Geometrical constraints and contact areas. J. Mol. Biol. 119, 537-555. Sali, A. (1995). Protein modeling by satisfaction of spatial restraints. Molecular Medicine Today, 1, 270-277. Sali, A. & Blundell, T. L. (1990). Definition of general topological equivalence in protein structures: A procedure involving comparison of properties and relationships through simulated annealing and dynamic programming. J. Mol. Biol. 212, 403-428. Sali, A. & Blundell, T. L. (1993). Comparative protein modelling by satisfaction of spatial restraints. J. Mol. Biol. 234, 779-815. Sali, A. 
& Overington, J. (1994). Derivation of rules for comparative protein modeling from a database of protein structure alignments. Protein Sci. 3, 1582-1596. Sali, A., Potterton, L., Yuan, F., van Vlijmen, H., & Karplus, M. (1995). Evaluation of comparative protein modeling by MODELLER. Proteins, 23, 318-326. Sánchez, R. & Sali, A. (a). Evaluation of comparative protein structure modeling by MODELLER-3. Proteins, submitted. Sankoff, D. & Kruskal, J. B. (1983). Time warps, string edits, and macromolecules: The theory and practice of sequence comparison. Reading, MA: Addison-Wesley Publishing Company. Sellers, P. H. (1974). An algorithm for the distance between two finite sequences. J. Comb. Theor. A16, 253-258. Shanno, D. F. & Phua, K. H. (1980). Minimization of unrestrained multivariate functions. ACM Trans. Math. Soft. 6, 618-622. Shanno, D. F. & Phua, K. H. (1982). Remark on algorithm 500. In: Collected algorithms from ACM. Trans. Math. Software volume 2(1). Sippl, M. J. (1993). Recognition of errors in three-dimensional structures of proteins. Proteins, 17, 355-362. Smith, T. F. & Waterman, M. S. (1981). Identification of common molecular subsequences. J. Mol. Biol. 147, 195-197. Subbiah, S., Laurents, D. V., & Levitt, M. (1993). Structural similarity of DNA-binding domains of bacteriophage repressors and the globin core. Curr. Biol. 3, 141-148. Sutcliffe, M. J., Haneef, I., Carney, D., & Blundell, T. L. (1987). Knowledge based modelling of homologous proteins, Part I: Three dimensional frameworks derived from the simultaneous superposition of multiple structures. Protein Eng. 1, 377-384. van Schaik, R. C., Berendsen, H. J., & Torda, A. E. (1993). A structure refinement method based on molecular dynamics in four spatial dimensions. J. Mol. Biol. 234, 751-762. Verlet, J. (1967). Computer "experiments" on classical fluids: I. Thermodynamical properties of Lennard-Jones molecules. Phys. Rev. 159, 98-103. 
Index ACCESSIBILITY_TYPE, 47, 75 CAP_ATOM_SHIFT, 92, 94, 125 ADD __PARAMETERS, 28 CHANGE, 45 ADD __RESTRAINT, 76, 81, 82, 106 CHECK_ALIGNMENT, 98 ADD __RESTRAINTS, 75, 80, 84 CHECK_ALIGNMENT, 14, 17, 18, 52, 112 ADD __SEGMENT, 29 CLOSE, 117 ADD __SEQUENCE, 51, 54 CLUSTER_CUT, 37 ADD __SYMMETRY, 78 COLOR_ALN_MODEL, 53 ADD __TOPOLOGY, 28 COMMAND, 118, 120 ALIGN, 14, 56, 58, 60, 62, 63, 66 COMPARE, 58, 64, 65 ALIGN2D, 14, 55, 59, 60 COMPARE_ALIGNMENTS, 52, 53, 55, 59 ALIGN3D, 59, 63, 66, 68 COMPARE_MODE, 65 ALIGN3D_REPEAT, 66 COMPARE_SEQUENCES, 15 ALIGN3D_TRF, 66 CONCATENATE, 116 ALIGN_CONSENSUS, 55 CONDENSE_RESTRAINTS, 81-83, 106 ALIGN_ALIGNMENT, 55, 56 CONSENSUS_ALIGNMENT, 55 ALIGN_BLOCK, 55, 56, 58-60, 62 CONTACT_SHELL, 76, 88, 91-93 ALIGN_CODES, 29, 38, 39, 49, 51, 52, 54-57, 63-65, COULOMB_SWITCH, 88, 91-93 67, 70 CURRENT_DIRECTORY, 67, 68 ALIGN_CONSENSUS, 62 CUT_OVERHANGS, 55, 56 ALIGN_WHAT, 59 alignment, 14, 17, 49 DATA_FILE, 70, 71 ALIGNMENT_FEATURES, 55, 70 DEBUG_FUNCTION, 96 ALIGNMENT_FORMAT, 51, 52, 55, 70 DEBUG_FUNCTION_CUTOFF, 96 ARGUMENTS, 116, 120 DEFINE_INTEGER, 115 ASGL_OUTPUT, 65, 88 DEFINE_LOGICAL, 115 ATOM_FILES, 56 DEFINE_REAL, 115 ATOM_FILES_DIRECTORY, 24, 56 DEFINE_STRING, 115 ATOM_CLASSES_FILE, 28, 29 DEFINE_SYMMETRY, 78 ATOM_FILES, 31, 49, 52, 54-56, 64, 65, 67 DELETE_ALIGNMENT, 71 ATOM_FILES_DIRECTORY, 25, 34, 55, 56, 64, 67, DELETE_RESTRAINT, 82, 106 75 DENDROGRAM, 15, 37, 57, 58, 65 ATOM_IDS, 77, 81, 82 DESCRIBE, 56 ATOM_TYPES, 40, 41 DETAILED_DEBUGGING, 96 DEVIATION, 43, 45, 98 BASIS_PDF_WEIGHT, 77 DIHEDRALS, 45 BASIS_PDF_WEIGHT, 75 DIRECTORY, 24, 25, 28, 51, 52, 57, 58, 60, 62, 69, BASIS_RELATIVE_WEIGHT, 75, 77 70, 84, 87, 95, 116, 120 benchmarks, 20 DISTANCE_ATOMS, 65 bibliography, 2 DISTANCE_RSR_MODEL, 75, 77 BIN __LIB_FILE, 24, 25, 75, 77 DO, 115, 119 BLOCK residues, 27 DYNAMIC_COULOMB, 76, 88, 91-93 breakpoint, 23 DYNAMIC_FLAG, 91, 93, 94 bug reports, 11 DYNAMIC_LENNARD, 76, 88, 91-93 BUILD_METHOD, 36 DYNAMIC_MODELLER, 88, 91, 93 
BUILD_MODEL, 27, 29, 36, 38, 98 DYNAMIC_SPHERE, 76, 88, 91-93 CALL, 118 ELSE, 119, 120 139 140 INDEX END __SUBROUTINE, 115, 119 LENNARD_JONES_SWITCH[2], 92 END __DO, 115, 119 LIBRARY_SCHEDULE, 86 END __SUBROUTINE, 119 LOCAL_ALIGNMENT, 59 ENERGY, 15, 29, 72, 78, 88, 90, 93, 96, 99, 112, 124 LOCAL_ALIGNMENT, 59, 60, 62, 66, 67, 70 ENERGY_PROFILE, 29, 72, 90, 93, 124 EQUILIBRATE, 92, 125 MAKE_SCHEDULE, 98 EQUILIBRATION, 94 MAKE_TOPOLOGY_MODEL, 33 evaluation, 20 MAKE_RESTRAINTS, 27, 31, 40, 72, 75, 84, 85, 93, EXPAND_ALIGNMENT, 69 98, 106, 111, 112 EXTEND_HOT_SPOT, 42 MAKE_SCHEDULE, 80, 86, 87 EXTEND_HOT_SPOT, 42 MAKE_TOPOLOGY_MODEL, 28, 33 MALIGN, 14, 59, 62, 68 FAST_SEARCH, 70, 71 MALIGN3D, 14, 37, 59, 65, 67, 68 FAST_SEARCH_CUTOFF, 70, 71 MATRIX_FILE, 57 FILE, 24, 28, 33-35, 47, 51, 52, 55, 58, 70, 84, 87, 95, MATRIX_FILE, 56-58, 65 117, 118, 120 MATRIX_OFFSET, 58-60, 62, 66, 67, 70 file naming, 24 MAX_GAPS_MATCH, 58 file types, 25 MAX_GAPS_MATCH, 57 FILE_ACCESS, 117, 120 MAX_ITERATIONS, 92-94 FILE_EXISTS, 118, 120 MAXIMAL_DISTANCE, 75, 77 FILE_EXT, 24, 69, 116, 120 MD_RETURN, 92, 94 FILE_FORMAT, 84 MD_TIME, 94 FILE_ID, 24, 69, 116, 120 MD_TIME_STEP, 92 FILE_STATUS, 117, 120 MDT_LIB_FILE, 24, 25, 75, 77 FINAL_MALIGN3D, 99 method, 12 FIT, 55, 60, 61, 63, 65-68 MIN_ATOM_SHIFT, 93 FIT __ATOMS, 65-68 MIN_ATOM_SHIFTS, 92 model evaluation, 15 GAP __PENALTIES_1D, 58-60, 62, 70 MODEL2_SEGMENT, 35 GAP __PENALTIES_2D, 60, 61 MODEL_FORMAT, 34, 35 GAP __PENALTIES_3D, 66-68 MODEL_SEGMENT, 34, 50 GENERATE_TOPOLOGY, 76 MODEL_TOPOLOGY, 110 GENERATE_TOPOLOGY, 29-31, 34, 45, 98, 112 modeling primer, 14 GO T___O, 115, 118-120 MODELLER_STATUS, 23, 51, 52, 59, 60, 62, 66, 68, HETATM_IO, 34, 35, 55, 56 70, 92, 94 HYDROGEN_IO, 34, 35, 55, 56 MOLPDF, 88, 92, 94 MUTATE_MODEL, 31, 40 ID1, 24, 69, 116, 120 ID2, 24, 69, 116, 120 N_SCHEDULE, 80, 87 ID T___ABLE, 15, 56, 58 NLOGN_USE, 88, 91, 93, 94 IF, 118, 119 NORMALIZE_PROF, 90 INCLUDE, 25, 118 NUMBER_PLACES, 115, 117, 120 INCLUDE_FILE, 25, 
118, 120 INIT_VELOCITIES, 92, 94 OBJECTS, 117, 120 INITIALIZE_XYZ, 36 OBJECTS_FILE, 116, 120 INQUIRE, 118 OFF_DIAGONAL, 58-60, 62, 66, 67, 70 installation, 6 OPEN, 116 INTERSEGMENT, 75 OPERATE, 116 IO U___NIT, 116, 117, 120 OPERATION, 116, 119, 120 IUPAC_MODEL, 44 OPTIMIZATION_METHOD, 80, 91, 92 OPTIMIZE, 29, 42, 76, 89, 91-93, 95, 98, 99, 124 KEEP_RESTRAINTS, 80 ORIENT_MODEL, 46 KEEP_RESTRAINTS, 80 OUTPUT, 47, 65-68, 70, 71, 78, 88, 90, 92 OUTPUT_CONTROL, 24, 120 LABEL, 115, 118 OUTPUT_DIRECTORY, 25, 33, 35, 54-57, 84, 87, LENNARD_JONES_SWITCH, 88, 91, 93 117, 120 INDEX 141 OVERHANG, 55, 56, 58-60, 62, 66, 67, 70 ROOT_NAME, 116 ROOT_NAME, 24, 69, 116, 120 PATCH, 28-30, 71, 111, 112 ROTATE_DIHEDRALS, 40, 45 PATCH_DEFAULT, 29, 30 ROTATE_MODEL, 46 PATCH_DISULFIDES, 31, 98, 106 ROTATION_ANGLE, 46, 47 PICK_ATOMS, 27, 32, 39, 40, 63, 77, 93, 106 ROTATION_AXIS, 46, 47 PICK_ATOMS_SET, 39, 40 ROTATION_MATRIX, 46, 47 PICK_HOT_ATOMS, 29, 42, 73, 93, 124 ROUTINE, 118-120 PICK_HOT_CUTOFF, 42 RR_FILE, 57-60, 62, 69, 70 PICK_RESTRAINTS, 40, 42, 78, 80, 86, 87, 98, 106 PRINCIPAL_COMPONENTS, 57 SCHEDULE_SCALE, 86, 87 PRINCIPAL_COMPONENTS, 15, 58, 65 SCHEDULE_STEP, 80, 88, 91, 92 program distribution, 5 script file, 17, 23 program updates, 22 SEARCH_CHAINS_FILE, 70 SEARCH_CHAINS_FILE, 70, 71 RADII_FACTOR, 75, 88, 91-93 SEARCH_CHAINS_LIST, 70 RAND_SEED, 36, 43, 45, 70, 92, 94 SEARCH_RANDOMIZATIONS, 70, 71 RANDOMIZE, 45 SEARCH_SORT, 70 RANDOMIZE_XYZ, 40, 43, 86, 98 SEARCH_TOP_LIST, 70 READ, 117 SEGMENT_IDS, 39 READ_ALIGNMENT, 98 SELECTION_FROM, 40 READ_ALIGNMENT2, 54 SELECTION_MODE, 40-42 READ_ALIGNMENT, 51-54, 56, 65 SELECTION_SEARCH, 40 READ_ALIGNMENT2, 52, 53 SELECTION_SEGMENT, 40, 50 READ_ATOM_CLASS, 28 SELECTION_SLAB, 40 READ_MODEL, 27, 29, 34, 35, 45, 81, 98 SELECTION_STATUS, 40 READ_MODEL2, 34 SELECTION_STEP, 40 READ_PARAMETERS, 28 SEQUENCE, 29, 45 READ_RESTRAINTS, 84, 85, 106 SEQUENCE_COMPARISON, 58 READ_SCHEDULE, 80, 86, 87 SEQUENCE_COMPARISON, 57 READ_TOPOLOGY, 28, 45 
SEQUENCE_SEARCH, 14, 69 RECORD, 117, 120 SEQUENCE_TO_ALI, 32, 54 REINDEX_RESTRAINTS, 83 SET, 24, 30, 115 RELATIVE_DIELECTRIC, 88, 91 SET NLOGN_USE, 94 REMOVE_GAPS, 51, 52, 70 SHEET_H-BONDS, 77 RENAME_SEGMENTS, 39 SMOOTHING_WINDOW, 90 RENUMBER_RESIDUES, 39 SPHERE_CENTER, 40 REORDER2_ATOMS, 45 SPHERE_RADIUS, 40 REORDER_ATOMS, 35, 44, 45 SPHERE_STDV, 88, 91 RES __TYPES, 40, 41 SPLINE_MIN_POINTS, 84 RESET, 116 SPLINE_MIN_POINTS, 84 RESIDUE_SPAN_RANGE, 77 SPLINE_DX, 84 RESIDUE_TYPE, 32 SPLINE_MIN_POINTS, 84 RESIDUE_GROUPING, 75 SPLINE_ON_SITE, 75 RESIDUE_IDS, 30, 75, 77, 111 SPLINE_RANGE, 84 RESIDUE_SPAN_RANGE, 75, 77, 80, 93 SPLINE_RESTRAINTS, 84 RESIDUE_SPAN_SIGN, 77, 80, 93 SPLINE_SELECT, 84 RESIDUE_TYPE, 30, 31 STOP, 115, 120 RESTRAINT_PARAMETERS, 75, 77, 81 STOP_ON_ERROR, 23, 51, 52, 58, 60, 62, 66, 68, 70, RESTRAINT_STDEV, 75, 77 92, 94, 120 RESTRAINT_TYPE, 27, 75 STRING_ARGUMENTS, 116, 119, 120 RESTRAINTS_FILTER, 80, 81 STRING_IF, 119 RESULT, 116, 120 STRING_OPERATE, 116 RETURN, 119 SUBROUTINE, 119 RMS __CUTOFFS, 63 SUPERPOSE, 35, 63, 65 RMS __CUTOFFS, 63, 65 SUPERPOSE_REFINE, 63 142 INDEX SWITCH_TRACE, 92, 95 SYMMETRY_WEIGHT, 78 SYSTEM, 118 TEMPERATURE, 92, 94 templates, 14, 15 THEN, 119, 120 TOPOLOGY_MODEL, 28, 33, 75, 88, 91, 93 TRACE_OUTPUT, 92, 95 TRANSFER_RES_NUMB, 35 TRANSFER_RES_NUMB, 35, 38 TRANSFER_XYZ, 27, 29, 36, 37, 98 TRANSLATION, 46 tutorial, 17 UPDATE_DYNAMIC, 88, 91-94 VARIABILITY_FILE, 57, 58 VARIABLES, 115, 120 VIOL_REPORT_CUT, 42, 88, 89 VIOL_REPORT_CUT2, 88, 89 WATER_IO, 34, 35, 55, 56 WRITE, 117 WRITE_ALIGNMENT, 32, 55, 59, 61, 63, 68 WRITE_ALL_ATOMS, 35 WRITE_DATA, 47 WRITE_FIT, 14, 67, 68 WRITE_MODEL, 35, 98, 99 WRITE_MODEL2, 35 WRITE_RESTRAINTS, 84, 98 WRITE_SCHEDULE, 86, 87 WRITE_TOP, 117 WRITE_TOPOLOGY_MODEL, 33 WRITE_WHOLE_PDB, 67, 68