Difference between revisions of "Perl script for ATM"

From Opengenome.net
 
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
<pre>
+
#!/usr/bin/perl -w<br /><br />use strict;<br /><br />doMain();<br /><br />sub doMain<br />{<br />    my $baseDirPath = shift @ARGV || &quot;&quot;;<br />    my $pirAlignmentFile = shift @ARGV || &quot;&quot;;<br />    my $targetProteinCode = shift @ARGV || &quot;&quot;;<br />    my $pdbFiles_ref = \@ARGV || ();<br /><br />    print &quot;----\n&quot;;<br />    print &quot;\nChecking command-line parameters ...\n&quot;;<br />    printUsageAndExit() unless ( $baseDirPath &amp;&amp; -d $baseDirPath );<br />    $baseDirPath =~ s/\/$//; # remove the ending slash if exists<br /><br />    printUsageAndExit() unless ( $pirAlignmentFile &amp;&amp; -f &quot;$baseDirPath/$pirAlignmentFile&quot; &amp;&amp; $pirAlignmentFile =~ /\.pir$/ );<br />    printUsageAndExit() unless ( $targetProteinCode &amp;&amp; length($targetProteinCode) == 4 );<br />    printUsageAndExit() unless ( @$pdbFiles_ref &gt; 0 );<br /><br />    foreach my $pdbFile ( @$pdbFiles_ref )<br />    {<br />        printUsageAndExit() unless ( -f &quot;$baseDirPath/$pdbFile&quot; &amp;&amp; $pdbFile =~ /\.pdb$/ );<br />    }<br /><br />    print &quot;Making modeller input files ...\n&quot;;<br />    makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ); <br />}<br /><br />sub printUsageAndExit<br />{<br />    print STDERR &quot;Error!\n\n&quot;;<br />    print STDERR &quot;Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n&quot;; <br />    print STDERR &quot;   A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n&quot;;<br />    print STDERR &quot;    A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n&quot;;<br />    print STDERR &quot;   A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n&quot;;<br />    print STDERR &quot;   A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n&quot;;<br />    print STDERR &quot;----\n&quot;;<br />    exit;<br />}<br /><br />sub makeModellerInput<br />{<br />    my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_;<br />    print &quot;\tchanging the directory to $baseDirPath ...\n&quot;;<br />    chdir $baseDirPath or die &quot;Dir Change Error, $baseDirPath: $!&quot;;<br /><br />    my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/;<br /><br />    # modeller input files<br />    my $alignmentFile = $outputPrefix. &quot;.ali&quot;;<br />    my $topScriptFile = $outputPrefix. &quot;.top&quot;;<br />    my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, &quot;atm&quot; );<br /><br />    my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, &quot;pdb&quot; );<br />    my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile );<br /><br />    # parse the target protein's alignment and information first<br />    my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} );<br /><br />    # parse alignments and information of known proteins and make atom files of them<br />    foreach my $pdbCode ( keys %pdbCode2PdbFile )<br />    {<br />        my $atomFile = $pdbCode2AtomFile{ $pdbCode };<br />        my $pdbFile = $pdbCode2PdbFile{ $pdbCode };<br />        my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode };<br /><br />        # make the atom files<br />        print &quot;\tmaking the atom file, $atomFile ...\n&quot;;<br />        makeAtomFile( $pdbFile, $atomFile );<br /><br />        $alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile );<br />    }<br /><br />    # make the alignment file<br />    print &quot;\tmaking the alignment file, $alignmentFile ...\n&quot;;<br />    makeAlignmentFile( $alignmentFile, $alignmentFileContent );<br /><br />    # make the top script file <br />    print &quot;\tmaking the top script file, $topScriptFile ...\n&quot;;<br />    makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile );<br /><br />    print &quot;Finished!\n\n&quot;;<br />    print &quot;----\n&quot;;<br />}<br /><br />sub get_pdbCode2File<br />{<br />    my ( $pdbFiles_ref, $fileExtension ) = @_;<br />    my %pdbCode2File = ();<br /><br />    foreach my $pdbFile ( @$pdbFiles_ref )<br />    {<br />        my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters<br />        my $file = $pdbCode. &quot;.&quot;. $fileExtension;<br />        $pdbCode2File{ $pdbCode } = $file;<br />    }<br /><br />    return %pdbCode2File;<br />}<br /><br />sub get_pdbCode2PirAlignment<br />{<br />    my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_;<br />    my %pdbCode2PirAlignment = ();<br /><br />    my @pdbCodes = ();<br />    push @pdbCodes, $targetProteinCode;<br />    push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref );<br /><br />    $/ = &quot;*\n&quot;; # change the reading-block separator<br />    open FI, $pirAlignmentFile or die &quot;File Open Error, $pirAlignmentFile: $!\n&quot;;<br /><br />    while ( my $entry = <fi> )<br />    {<br />        chomp $entry;<br />        next if $entry =~ /^\s*$/;<br /><br />        my @lines = split /\n/, $entry;<br /><br />        my $pdbCode = &quot;&quot;;<br />        foreach my $code ( @pdbCodes )<br />        {<br />            if ( $lines[0] =~ /$code/ )<br />            {<br />                $pdbCode = $code;<br />                last;<br />            }<br />        }<br /><br />        if ( $pdbCode eq &quot;&quot; )<br />        {<br />            print STDERR &quot;PDB code mistmatch with pdb files on this entry!\n&quot;;<br />            print STDERR &quot;pdbCode: $pdbCode\n&quot;;<br />            print STDERR &quot;entry: $entry\n&quot;;<br />        }<br /><br />        $pdbCode2PirAlignment{ $pdbCode } = &quot;$entry*\n&quot;;<br />        print &quot;\nPDB code To PIR alignment found: $pdbCode =&gt;\n$entry\n&quot;;<br />    }<br /><br />    close FI;<br />    $/ = &quot;\n&quot;; # restore the reading-block separator<br /><br />    return %pdbCode2PirAlignment;<br />}<br /><br />sub getTargetProteinAlignment<br />{<br />    my ( $targetProteinCode, $targetProteinAlignment ) = @_;<br />    my $targetProteinAlignmentContent = &quot;&quot;;<br /><br />    my @lines = split /\n/, $targetProteinAlignment;<br />    shift @lines; # remove header line<br />    shift @lines; # remove blank line<br />    my $sequenceAlignmentOnly = join &quot;\n&quot;, @lines;<br /><br />    $targetProteinAlignmentContent .= &quot;&gt;P1;$targetProteinCode\n&quot;;<br />    $targetProteinAlignmentContent .= &quot;sequence: : : : : : : : :\n&quot;;<br />    $targetProteinAlignmentContent .= &quot;$sequenceAlignmentOnly\n&quot;;<br /><br />    return $targetProteinAlignmentContent;<br />}<br /><br />sub makeAtomFile<br />{<br />    my ( $pdbFile, $atomFile ) = @_;<br />    !system(&quot;cp $pdbFile $atomFile&quot;) or die &quot;File Copy Error: $!&quot;;<br />    print &quot;\tcopying $pdbFile to $atomFile ...\n&quot;;<br />}<br /><br />sub get_eachAlignmentContent<br />{<br />    my ( $pdbCode, $pirAlignment, $pdbFile ) = @_;<br />    my $eachAlignmentContent = &quot;&quot;;<br /><br />    my @lines = split /\n/, $pirAlignment;<br />    shift @lines; # remove header line<br />    shift @lines; # remove blank line<br />    pop @lines; # remove ending *<br />    my $sequenceAlignmentOnly = join &quot;\n&quot;, @lines;<br />    $sequenceAlignmentOnly =~ s/[Xx]/-/g;<br /><br />    my ( $structureInfoLine, $adjustedAlignmentSequence ) <br />        = get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile );<br />    $eachAlignmentContent .= &quot;&gt;P1;$pdbCode\n&quot;;<br />    $eachAlignmentContent .= $structureInfoLine;<br />    $eachAlignmentContent .= &quot;$adjustedAlignmentSequence\n*\n&quot;;<br /><br />    return $eachAlignmentContent;<br />}<br /><br />sub get_structureInfoAndAdjustedSequence<br />{<br />    my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_;<br />    my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( &quot;&quot;, &quot;&quot; );<br /><br />    my ( $startResNum, $startChain, $endResNum, $endChain ) = (&quot;&quot;, &quot;&quot;, &quot;&quot;, &quot;&quot;);<br />    my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = (&quot;&quot;, &quot;&quot;, &quot;&quot;, &quot;&quot;, &quot;&quot;);<br />    my $allResiduesInPdb = &quot;&quot;;<br />    my $includedResNum = 0;<br />    my $isChainExist = 1; # true<br />        <br /><br />    open FI, $pdbFile or die &quot;File Open Error, $pdbFile: $!\n&quot;;<br />    while ( my $line = <fi> )<br />    {<br />        chomp $line;<br /><br />        if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/)<br />        {<br />            $proteinName = $1;<br />        }<br />        elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/)<br />        {<br />            $isChainExist = 0; # false<br />        }<br />        elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/)<br />        {<br />            $source = $1;<br />        }<br />        elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/)<br />        {<br />            my $method = $1;<br />            $structureXorN = <br />                ( $method =~ /X-RAY/ ) ?  &quot;structureX&quot; :<br />                    ( $method =~ /NMR/ ) ? &quot;structureN&quot; :<br />                        &quot;structure&quot;;<br />        }<br />        elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/)<br />        {<br />            $resolution = $1;<br />        }<br />        elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+\(WORKING SET\)\s*:\s*([\d\.\-]+)\s*$/)<br />        {<br />            $rFactor = $1;<br />        }<br />        elsif ($line =~ /^ATOM/)<br />        {<br />            my @values = split /\s+/, $line;<br />            my $threeLetterAminoAcidCode = $values[3];<br />            my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode);<br /><br />            if ( $startResNum eq &quot;&quot; &amp;&amp; $isChainExist )<br />            {<br />                ( $startChain, $startResNum ) = ( $values[4], $values[5] );<br />            }<br />            elsif ( $startResNum eq &quot;&quot; )<br />            {<br />                $startResNum = $values[4];<br />            }<br /><br />            if ( $isChainExist )<br />            {<br />                ( $endChain, $endResNum ) = ( $values[4], $values[5] );<br />            }<br />            else<br />            {<br />                $endResNum = $values[4];<br />            }<br /><br />            if ( $endResNum ne $includedResNum )<br />            {<br />                $allResiduesInPdb .= $oneLetterAminoAcidCode;<br />                $includedResNum = $endResNum;<br />            }<br />        }<br />        elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read )<br />        {<br />            last;<br />        }<br />    }<br />    close FI;<br /><br />    $structureInfoLine = &quot;$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain &quot;.<br />        &quot;:$proteinName :$source :$resolution :$rFactor \n&quot;;<br />    $adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb );<br /><br />    return ( $structureInfoLine, $adjustedAlignmentSequence );<br />}<br /><br />sub convertThreeToOneLetterAminoAcidCode<br />{<br />    my ( $threeLetterAminoAcidCode ) = @_;<br />    return<br />        ( $threeLetterAminoAcidCode =~ /gly/i ) ? &quot;G&quot; :<br />        ( $threeLetterAminoAcidCode =~ /pro/i ) ? &quot;P&quot; :<br />        ( $threeLetterAminoAcidCode =~ /ala/i ) ? &quot;A&quot; :<br />        ( $threeLetterAminoAcidCode =~ /val/i ) ? &quot;V&quot; :<br />        ( $threeLetterAminoAcidCode =~ /leu/i ) ? &quot;L&quot; :<br />        ( $threeLetterAminoAcidCode =~ /ile/i ) ? &quot;I&quot; :<br />        ( $threeLetterAminoAcidCode =~ /met/i ) ? &quot;M&quot; :<br />        ( $threeLetterAminoAcidCode =~ /cys/i ) ? &quot;C&quot; :<br />        ( $threeLetterAminoAcidCode =~ /phe/i ) ? &quot;F&quot; :<br />        ( $threeLetterAminoAcidCode =~ /tyr/i ) ? &quot;Y&quot; :<br />        ( $threeLetterAminoAcidCode =~ /trp/i ) ? &quot;W&quot; :<br />        ( $threeLetterAminoAcidCode =~ /his/i ) ? &quot;H&quot; :<br />        ( $threeLetterAminoAcidCode =~ /lys/i ) ? &quot;K&quot; :<br />        ( $threeLetterAminoAcidCode =~ /arg/i ) ? &quot;R&quot; :<br />        ( $threeLetterAminoAcidCode =~ /gln/i ) ? &quot;Q&quot; :<br />        ( $threeLetterAminoAcidCode =~ /asn/i ) ? &quot;N&quot; :<br />        ( $threeLetterAminoAcidCode =~ /glu/i ) ? &quot;E&quot; :<br />        ( $threeLetterAminoAcidCode =~ /asp/i ) ? &quot;D&quot; :<br />        ( $threeLetterAminoAcidCode =~ /ser/i ) ? &quot;S&quot; :<br />        ( $threeLetterAminoAcidCode =~ /thr/i ) ? &quot;T&quot; :<br />        &quot;-&quot;;<br />}<br /><br />sub get_adjustedAlignmentSequence<br />{<br />    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;<br />    my $adjustedAlignmentSequence = &quot;&quot;;<br /><br />    my @lines = split /\n/, $sequenceAlignmentOnly;<br />    my $charactersPerLine = length $lines[0];<br />    $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters<br /><br />    # substitute front and end residues without atom coordinates to '-'s <br />    my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb ); <br />    my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb ); <br />    $adjustedAlignmentSequence = &quot;-&quot; x ($startPos - 1).<br />        substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based<br />        &quot;-&quot; x ( length($sequenceAlignmentOnly) - $endPos );<br /><br />    # substitute loop residues without atom coordinates to '-'s <br />    my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb );<br />    foreach my $loopPosition ( @loopPositions )<br />    {<br />        my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56<br />        $adjustedAlignmentSequence = <br />            substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ).<br />            &quot;-&quot; x ( $loopEndPos - $loopStartPos + 1).<br />            substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos );<br />    }<br /><br />    my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence;<br />    $dashRemovedAdjustedAlignmentSequence =~ s/-//g;<br /><br />    if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb )<br />    {<br />        print STDERR &quot;\nError!!!\n&quot;;<br />        print STDERR &quot;Number of residues in the PIR-alignment file and the PDB file mismatches !\n&quot;;<br />        print STDERR &quot;Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n&quot;;<br />        print STDERR &quot;dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n&quot;;<br />        print STDERR &quot;allResiduesInPdb: $allResiduesInPdb\n&quot;;<br />    }<br /><br />    return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine);<br />}<br /><br />sub get_startPos<br />{<br />    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;<br /><br />    # Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment<br />    #            matching with the residues in the PDB file  <br />    my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 );<br /><br />    my $startPos = &quot;&quot;;<br />    my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;<br />    my $pos = 0;<br /><br />    foreach my $aa ( @allAminoAcidsInTheAlignmentSequence )<br />    {<br />        $pos++;<br />        next if $aa eq &quot;-&quot;;<br /><br />        my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 );<br />        $frontRemovedSeq =~ s/-//g; # remove all dashes<br /><br />        if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i )<br />        {<br />            $startPos = $pos;<br />            print &quot;\tstart position or non-loop start position determined: $pos\n&quot;;<br />            last;<br />        }<br />    }<br /><br />    if ( $startPos eq &quot;&quot; )<br />    {<br />        print STDERR &quot;\nError!\n&quot;;<br />        print STDERR &quot;Start position mismatches!\n&quot;;<br />        print STDERR &quot;Check the start amino acids of the alignment!\n\n&quot;;<br />        print STDERR &quot;sequenceAlignment: $sequenceAlignmentOnly\n&quot;;<br />        print STDERR &quot;residuesInPdb: $allResiduesInPdb\n\n&quot;;<br />    }<br /><br />    return $startPos;<br />}<br /><br />sub get_endPos<br />{<br />    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;<br /><br />    # Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment<br />    #            matching with the residues in the PDB file  <br />    my $reversedLastTenAminoAcidsOfAllResiduesInPdb = <br />        reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 ));<br /><br />    my $endPos = &quot;&quot;;<br />    my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;<br />    my $pos = @allAminoAcidsInTheAlignmentSequence + 1;<br /><br />    foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence )<br />    {<br />        $pos--;<br />        next if $aa eq &quot;-&quot;;<br /><br />        my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) );<br />        $endRemovedReverseSeq =~ s/-//g; # remove all dashes<br /><br />        if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i )<br />        {<br />            $endPos = $pos;<br />            print &quot;\tend position determined: $pos\n&quot;;<br />            last;<br />        }<br />    }<br /><br />    if ( $endPos eq &quot;&quot; )<br />    {<br />        print STDERR &quot;\nError!\n&quot;;<br />        print STDERR &quot;End position mismatches between the alignment file and the pdb file!\n&quot;;<br />        print STDERR &quot;Check the end amino acids of the alignment!\n\n&quot;;<br />        print STDERR &quot;sequenceAlignmentOnly: $sequenceAlignmentOnly\n&quot;;<br />        print STDERR &quot;allResiduesInPdb: $allResiduesInPdb\n\n&quot;;<br />    }<br /><br />    return $endPos;<br />}<br /><br />sub reverseMyString<br />{<br />    my ( $string ) = @_;<br />    my @allChars = split //, $string;<br /><br />    return join(&quot;&quot;, reverse @allChars );<br />}<br /><br />sub get_loopPositions<br />{<br />    my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_;<br />    my @loopPositions = ();<br /><br />    my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence;<br />    $dashRemovedAdjustedSequence =~ s/-//g;<br /><br />    if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb )<br />    {<br />        return @loopPositions;<br />    }<br /><br />    my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence;<br />    my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb;<br />    my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 );<br /><br />    while ( $alignmentResiduePos &lt;= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number<br />    {<br />        $alignmentResiduePos++;<br />        next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq &quot;-&quot;;<br />        $pdbFileResiduePos++;<br /><br />        if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne <br />                $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] )<br />        {<br />            my $frontRemovedAlignmentSeq = <br />                substr( $adjustedAlignmentSequence, <br />                        $alignmentResiduePos - 1, <br />                        length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 );<br />            my $frontRemovedPdbResidueSeq = <br />                substr( $allResiduesInPdb,<br />                        $pdbFileResiduePos - 1,<br />                        length($allResiduesInPdb) - $pdbFileResiduePos + 1 );<br /><br />            my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1;<br />            my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1);<br /><br />            if ( $loopLength ne &quot;&quot; &amp;&amp; $loopStartPos &lt;= $loopEndPos )<br />            {<br />                push @loopPositions, &quot;$loopStartPos-$loopEndPos&quot;;<br />                print &quot;\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n&quot;;<br /><br />                $alignmentResiduePos += $loopLength - 1;<br />                $pdbFileResiduePos--;<br />            }<br />        }<br />    }<br /><br />    return @loopPositions;<br />}<br /><br />sub formatFastaSequence<br />{<br />    my ($seq, $charactersPerLine) = @_;<br />    $seq =~ s/(.{$charactersPerLine})/$1\n/g;<br />    return $seq;<br />}<br /><br />sub makeAlignmentFile<br />{<br />    my ( $alignmentFile, $alignmentFileContent ) = @_;<br /><br />    open FO, &quot;&gt;$alignmentFile&quot; or die &quot;File Write Error, $alignmentFile: $!&quot;;<br />    print FO $alignmentFileContent;<br />    close FO;<br />}<br /><br />sub makeTopScriptFile<br />{<br />    my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_;<br /><br />    open FO, &quot;&gt;$topScriptFile&quot; or die &quot;File Write Error, $topScriptFile: $!&quot;;<br />    print FO printTopScriptInvariableContentFront();<br /><br />    # print variable contents of the top script file<br />    print FO &quot;SET ALNFILE  = '$alignmentFile'      # alignment filename\n&quot;;<br />    print FO &quot;SET KNOWNS  = &quot;;<br />    foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref )<br />    {<br />        print FO &quot;'$pdbCode' &quot;;<br />    }<br />    print FO &quot;               # codes of the templates\n&quot;;<br />    print FO &quot;SET SEQUENCE = '$targetProteinCode'              # code of the target\n&quot;;<br /><br />    print FO printTopScriptInvariableContentEnd();<br />    close FO;<br />}<br /><br />sub printTopScriptInvariableContentFront<br />{<br />    return &quot;&quot;.<br />        &quot;# Homology modelling by the MODELLER TOP routine 'model'.\n&quot;.<br />        &quot;\n&quot;.<br />        &quot;INCLUDE                            # Include the predefined TOP routines\n&quot;.<br />        &quot;\n&quot;.<br />        &quot;SET OUTPUT_CONTROL = 1 1 1 1 1      # uncomment to produce a large log file\n&quot;.<br />        &quot;\n&quot;;<br />}<br /><br />sub printTopScriptInvariableContentEnd<br />{<br />    return &quot;&quot;.<br />        &quot;SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n&quot;.<br />        &quot;SET STARTING_MODEL= 1              # index of the first model\n&quot;.<br />        &quot;SET ENDING_MODEL  = 1              # index of the last model\n&quot;.<br />        &quot;                                   # (determines how many models to calculate)\n&quot;.<br />        &quot;CALL ROUTINE = 'model'              # do homology modelling\n&quot;.<br />        &quot;\n&quot;;<br />}<br /><br /></fi></fi>
#!/usr/bin/perl -w
 
 
 
use strict;
 
 
 
doMain();
 
 
 
sub doMain
 
{
 
    my $baseDirPath = shift @ARGV || "";
 
    my $pirAlignmentFile = shift @ARGV || "";
 
    my $targetProteinCode = shift @ARGV || "";
 
    my $pdbFiles_ref = \@ARGV || ();
 
 
 
    print "----\n";
 
    print "\nChecking command-line parameters ...\n";
 
    printUsageAndExit() unless ( $baseDirPath && -d $baseDirPath );
 
    $baseDirPath =~ s/\/$//; # remove the ending slash if exists
 
 
 
    printUsageAndExit() unless ( $pirAlignmentFile && -f "$baseDirPath/$pirAlignmentFile" && $pirAlignmentFile =~ /\.pir$/ );
 
    printUsageAndExit() unless ( $targetProteinCode && length($targetProteinCode) == 4 );
 
    printUsageAndExit() unless ( @$pdbFiles_ref > 0 );
 
 
 
    foreach my $pdbFile ( @$pdbFiles_ref )
 
    {
 
        printUsageAndExit() unless ( -f "$baseDirPath/$pdbFile" && $pdbFile =~ /\.pdb$/ );
 
    }
 
 
 
    print "Making modeller input files ...\n";
 
    makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref );  
 
}
 
 
 
sub printUsageAndExit
 
{
 
    print STDERR "Error!\n\n";
 
    print STDERR "Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n";  
 
    print STDERR "   A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n";
 
    print STDERR "   A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n";
 
    print STDERR "   A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n";
 
    print STDERR "   A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n";
 
    print STDERR "----\n";
 
    exit;
 
}
 
 
 
sub makeModellerInput
 
{
 
    my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_;
 
    print "\tchanging the directory to $baseDirPath ...\n";
 
    chdir $baseDirPath or die "Dir Change Error, $baseDirPath: $!";
 
 
 
    my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/;
 
 
 
    # modeller input files
 
    my $alignmentFile = $outputPrefix. ".ali";
 
    my $topScriptFile = $outputPrefix. ".top";
 
    my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, "atm" );
 
 
 
    my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, "pdb" );
 
    my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile );
 
 
 
    # parse the target protein's alignment and information first
 
    my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} );
 
 
 
    # parse alignments and information of known proteins and make atom files of them
 
    foreach my $pdbCode ( keys %pdbCode2PdbFile )
 
    {
 
        my $atomFile = $pdbCode2AtomFile{ $pdbCode };
 
        my $pdbFile = $pdbCode2PdbFile{ $pdbCode };
 
        my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode };
 
 
 
        # make the atom files
 
        print "\tmaking the atom file, $atomFile ...\n";
 
        makeAtomFile( $pdbFile, $atomFile );
 
 
 
        $alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile );
 
    }
 
 
 
    # make the alignment file
 
    print "\tmaking the alignment file, $alignmentFile ...\n";
 
    makeAlignmentFile( $alignmentFile, $alignmentFileContent );
 
 
 
    # make the top script file  
 
    print "\tmaking the top script file, $topScriptFile ...\n";
 
    makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile );
 
 
 
    print "Finished!\n\n";
 
    print "----\n";
 
}
 
 
 
sub get_pdbCode2File
 
{
 
    my ( $pdbFiles_ref, $fileExtension ) = @_;
 
    my %pdbCode2File = ();
 
 
 
    foreach my $pdbFile ( @$pdbFiles_ref )
 
    {
 
        my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters
 
        my $file = $pdbCode. ".". $fileExtension;
 
        $pdbCode2File{ $pdbCode } = $file;
 
    }
 
 
 
    return %pdbCode2File;
 
}
 
 
 
sub get_pdbCode2PirAlignment
 
{
 
    my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_;
 
    my %pdbCode2PirAlignment = ();
 
 
 
    my @pdbCodes = ();
 
    push @pdbCodes, $targetProteinCode;
 
    push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref );
 
 
 
    $/ = "*\n"; # change the reading-block separator
 
    open FI, $pirAlignmentFile or die "File Open Error, $pirAlignmentFile: $!\n";
 
 
 
    while ( my $entry = <FI> )
 
    {
 
        chomp $entry;
 
        next if $entry =~ /^\s*$/;
 
 
 
        my @lines = split /\n/, $entry;
 
 
 
        my $pdbCode = "";
 
        foreach my $code ( @pdbCodes )
 
        {
 
            if ( $lines[0] =~ /$code/ )
 
            {
 
                $pdbCode = $code;
 
                last;
 
            }
 
        }
 
 
 
        if ( $pdbCode eq "" )
 
        {
 
            print STDERR "PDB code mistmatch with pdb files on this entry!\n";
 
            print STDERR "pdbCode: $pdbCode\n";
 
            print STDERR "entry: $entry\n";
 
        }
 
 
 
        $pdbCode2PirAlignment{ $pdbCode } = "$entry*\n";
 
        print "\nPDB code To PIR alignment found: $pdbCode =>\n$entry\n";
 
    }
 
 
 
    close FI;
 
    $/ = "\n"; # restore the reading-block separator
 
 
 
    return %pdbCode2PirAlignment;
 
}
 
 
 
sub getTargetProteinAlignment
 
{
 
    my ( $targetProteinCode, $targetProteinAlignment ) = @_;
 
    my $targetProteinAlignmentContent = "";
 
 
 
    my @lines = split /\n/, $targetProteinAlignment;
 
    shift @lines; # remove header line
 
    shift @lines; # remove blank line
 
    my $sequenceAlignmentOnly = join "\n", @lines;
 
 
 
    $targetProteinAlignmentContent .= ">P1;$targetProteinCode\n";
 
    $targetProteinAlignmentContent .= "sequence: : : : : : : : :\n";
 
    $targetProteinAlignmentContent .= "$sequenceAlignmentOnly\n";
 
 
 
    return $targetProteinAlignmentContent;
 
}
 
 
 
sub makeAtomFile
 
{
 
    my ( $pdbFile, $atomFile ) = @_;
 
    !system("cp $pdbFile $atomFile") or die "File Copy Error: $!";
 
    print "\tcopying $pdbFile to $atomFile ...\n";
 
}
 
 
 
sub get_eachAlignmentContent
 
{
 
    my ( $pdbCode, $pirAlignment, $pdbFile ) = @_;
 
    my $eachAlignmentContent = "";
 
 
 
    my @lines = split /\n/, $pirAlignment;
 
    shift @lines; # remove header line
 
    shift @lines; # remove blank line
 
    pop @lines; # remove ending *
 
    my $sequenceAlignmentOnly = join "\n", @lines;
 
    $sequenceAlignmentOnly =~ s/[Xx]/-/g;
 
 
 
    my ( $structureInfoLine, $adjustedAlignmentSequence )  
 
        = get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile );
 
    $eachAlignmentContent .= ">P1;$pdbCode\n";
 
    $eachAlignmentContent .= $structureInfoLine;
 
    $eachAlignmentContent .= "$adjustedAlignmentSequence\n*\n";
 
 
 
    return $eachAlignmentContent;
 
}
 
 
 
sub get_structureInfoAndAdjustedSequence
 
{
 
    my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_;
 
    my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( "", "" );
 
 
 
    my ( $startResNum, $startChain, $endResNum, $endChain ) = ("", "", "", "");
 
    my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = ("", "", "", "", "");
 
    my $allResiduesInPdb = "";
 
    my $includedResNum = 0;
 
    my $isChainExist = 1; # true
 
       
 
 
 
    open FI, $pdbFile or die "File Open Error, $pdbFile: $!\n";
 
    while ( my $line = <FI> )
 
    {
 
        chomp $line;
 
 
 
        if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/)
 
        {
 
            $proteinName = $1;
 
        }
 
        elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/)
 
        {
 
            $isChainExist = 0; # false
 
        }
 
        elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/)
 
        {
 
            $source = $1;
 
        }
 
        elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/)
 
        {
 
            my $method = $1;
 
            $structureXorN =  
 
                ( $method =~ /X-RAY/ ) ?  "structureX" :
 
                    ( $method =~ /NMR/ ) ? "structureN" :
 
                        "structure";
 
        }
 
        elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/)
 
        {
 
            $resolution = $1;
 
        }
 
        elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+\(WORKING SET\)\s*:\s*([\d\.\-]+)\s*$/)
 
        {
 
            $rFactor = $1;
 
        }
 
        elsif ($line =~ /^ATOM/)
 
        {
 
            my @values = split /\s+/, $line;
 
            my $threeLetterAminoAcidCode = $values[3];
 
            my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode);
 
 
 
            if ( $startResNum eq "" && $isChainExist )
 
            {
 
                ( $startChain, $startResNum ) = ( $values[4], $values[5] );
 
            }
 
            elsif ( $startResNum eq "" )
 
            {
 
                $startResNum = $values[4];
 
            }
 
 
 
            if ( $isChainExist )
 
            {
 
                ( $endChain, $endResNum ) = ( $values[4], $values[5] );
 
            }
 
            else
 
            {
 
                $endResNum = $values[4];
 
            }
 
 
 
            if ( $endResNum ne $includedResNum )
 
            {
 
                $allResiduesInPdb .= $oneLetterAminoAcidCode;
 
                $includedResNum = $endResNum;
 
            }
 
        }
 
        elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read )
 
        {
 
            last;
 
        }
 
    }
 
    close FI;
 
 
 
    $structureInfoLine = "$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain ".
 
        ":$proteinName :$source :$resolution :$rFactor \n";
 
    $adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb );
 
 
 
    return ( $structureInfoLine, $adjustedAlignmentSequence );
 
}
 
 
 
sub convertThreeToOneLetterAminoAcidCode
 
{
 
    my ( $threeLetterAminoAcidCode ) = @_;
 
    return
 
        ( $threeLetterAminoAcidCode =~ /gly/i ) ? "G" :
 
        ( $threeLetterAminoAcidCode =~ /pro/i ) ? "P" :
 
        ( $threeLetterAminoAcidCode =~ /ala/i ) ? "A" :
 
        ( $threeLetterAminoAcidCode =~ /val/i ) ? "V" :
 
        ( $threeLetterAminoAcidCode =~ /leu/i ) ? "L" :
 
        ( $threeLetterAminoAcidCode =~ /ile/i ) ? "I" :
 
        ( $threeLetterAminoAcidCode =~ /met/i ) ? "M" :
 
        ( $threeLetterAminoAcidCode =~ /cys/i ) ? "C" :
 
        ( $threeLetterAminoAcidCode =~ /phe/i ) ? "F" :
 
        ( $threeLetterAminoAcidCode =~ /tyr/i ) ? "Y" :
 
        ( $threeLetterAminoAcidCode =~ /trp/i ) ? "W" :
 
        ( $threeLetterAminoAcidCode =~ /his/i ) ? "H" :
 
        ( $threeLetterAminoAcidCode =~ /lys/i ) ? "K" :
 
        ( $threeLetterAminoAcidCode =~ /arg/i ) ? "R" :
 
        ( $threeLetterAminoAcidCode =~ /gln/i ) ? "Q" :
 
        ( $threeLetterAminoAcidCode =~ /asn/i ) ? "N" :
 
        ( $threeLetterAminoAcidCode =~ /glu/i ) ? "E" :
 
        ( $threeLetterAminoAcidCode =~ /asp/i ) ? "D" :
 
        ( $threeLetterAminoAcidCode =~ /ser/i ) ? "S" :
 
        ( $threeLetterAminoAcidCode =~ /thr/i ) ? "T" :
 
        "-";
 
}
 
 
 
sub get_adjustedAlignmentSequence
 
{
 
    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
 
    my $adjustedAlignmentSequence = "";
 
 
 
    my @lines = split /\n/, $sequenceAlignmentOnly;
 
    my $charactersPerLine = length $lines[0];
 
    $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters
 
 
 
    # substitute front and end residues without atom coordinates to '-'s  
 
    my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb );  
 
    my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb );  
 
    $adjustedAlignmentSequence = "-" x ($startPos - 1).
 
        substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based
 
        "-" x ( length($sequenceAlignmentOnly) - $endPos );
 
 
 
    # substitute loop residues without atom coordinates to '-'s  
 
    my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb );
 
    foreach my $loopPosition ( @loopPositions )
 
    {
 
        my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56
 
        $adjustedAlignmentSequence =  
 
            substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ).
 
            "-" x ( $loopEndPos - $loopStartPos + 1).
 
            substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos );
 
    }
 
 
 
    my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence;
 
    $dashRemovedAdjustedAlignmentSequence =~ s/-//g;
 
 
 
    if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb )
 
    {
 
        print STDERR "\nError!!!\n";
 
        print STDERR "Number of residues in the PIR-alignment file and the PDB file mismatches !\n";
 
        print STDERR "Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n";
 
        print STDERR "dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n";
 
        print STDERR "allResiduesInPdb: $allResiduesInPdb\n";
 
    }
 
 
 
    return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine);
 
}
 
 
 
sub get_startPos
 
{
 
    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
 
 
 
    # Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment
 
    #            matching with the residues in the PDB file   
 
    my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 );
 
 
 
    my $startPos = "";
 
    my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;
 
    my $pos = 0;
 
 
 
    foreach my $aa ( @allAminoAcidsInTheAlignmentSequence )
 
    {
 
        $pos++;
 
        next if $aa eq "-";
 
 
 
        my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 );
 
        $frontRemovedSeq =~ s/-//g; # remove all dashes
 
 
 
        if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i )
 
        {
 
            $startPos = $pos;
 
            print "\tstart position or non-loop start position determined: $pos\n";
 
            last;
 
        }
 
    }
 
 
 
    if ( $startPos eq "" )
 
    {
 
        print STDERR "\nError!\n";
 
        print STDERR "Start position mismatches!\n";
 
        print STDERR "Check the start amino acids of the alignment!\n\n";
 
        print STDERR "sequenceAlignment: $sequenceAlignmentOnly\n";
 
        print STDERR "residuesInPdb: $allResiduesInPdb\n\n";
 
    }
 
 
 
    return $startPos;
 
}
 
 
 
sub get_endPos
 
{
 
    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
 
 
 
    # Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment
 
    #            matching with the residues in the PDB file   
 
    my $reversedLastTenAminoAcidsOfAllResiduesInPdb =  
 
        reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 ));
 
 
 
    my $endPos = "";
 
    my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;
 
    my $pos = @allAminoAcidsInTheAlignmentSequence + 1;
 
 
 
    foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence )
 
    {
 
        $pos--;
 
        next if $aa eq "-";
 
 
 
        my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) );
 
        $endRemovedReverseSeq =~ s/-//g; # remove all dashes
 
 
 
        if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i )
 
        {
 
            $endPos = $pos;
 
            print "\tend position determined: $pos\n";
 
            last;
 
        }
 
    }
 
 
 
    if ( $endPos eq "" )
 
    {
 
        print STDERR "\nError!\n";
 
        print STDERR "End position mismatches between the alignment file and the pdb file!\n";
 
        print STDERR "Check the end amino acids of the alignment!\n\n";
 
        print STDERR "sequenceAlignmentOnly: $sequenceAlignmentOnly\n";
 
        print STDERR "allResiduesInPdb: $allResiduesInPdb\n\n";
 
    }
 
 
 
    return $endPos;
 
}
 
 
 
sub reverseMyString
 
{
 
    my ( $string ) = @_;
 
    my @allChars = split //, $string;
 
 
 
    return join("", reverse @allChars );
 
}
 
 
 
sub get_loopPositions
 
{
 
    my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_;
 
    my @loopPositions = ();
 
 
 
    my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence;
 
    $dashRemovedAdjustedSequence =~ s/-//g;
 
 
 
    if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb )
 
    {
 
        return @loopPositions;
 
    }
 
 
 
    my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence;
 
    my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb;
 
    my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 );
 
 
 
    while ( $alignmentResiduePos <= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number
 
    {
 
        $alignmentResiduePos++;
 
        next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq "-";
 
        $pdbFileResiduePos++;
 
 
 
        if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne  
 
                $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] )
 
        {
 
            my $frontRemovedAlignmentSeq =  
 
                substr( $adjustedAlignmentSequence,  
 
                        $alignmentResiduePos - 1,  
 
                        length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 );
 
            my $frontRemovedPdbResidueSeq =  
 
                substr( $allResiduesInPdb,
 
                        $pdbFileResiduePos - 1,
 
                        length($allResiduesInPdb) - $pdbFileResiduePos + 1 );
 
 
 
            my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1;
 
            my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1);
 
 
 
            if ( $loopLength ne "" && $loopStartPos <= $loopEndPos )
 
            {
 
                push @loopPositions, "$loopStartPos-$loopEndPos";
 
                print "\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n";
 
 
 
                $alignmentResiduePos += $loopLength - 1;
 
                $pdbFileResiduePos--;
 
            }
 
        }
 
    }
 
 
 
    return @loopPositions;
 
}
 
 
 
sub formatFastaSequence
 
{
 
    my ($seq, $charactersPerLine) = @_;
 
    $seq =~ s/(.{$charactersPerLine})/$1\n/g;
 
    return $seq;
 
}
 
 
 
sub makeAlignmentFile
 
{
 
    my ( $alignmentFile, $alignmentFileContent ) = @_;
 
 
 
    open FO, ">$alignmentFile" or die "File Write Error, $alignmentFile: $!";
 
    print FO $alignmentFileContent;
 
    close FO;
 
}
 
 
 
sub makeTopScriptFile
 
{
 
    my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_;
 
 
 
    open FO, ">$topScriptFile" or die "File Write Error, $topScriptFile: $!";
 
    print FO printTopScriptInvariableContentFront();
 
 
 
    # print variable contents of the top script file
 
    print FO "SET ALNFILE  = '$alignmentFile'      # alignment filename\n";
 
    print FO "SET KNOWNS  = ";
 
    foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref )
 
    {
 
        print FO "'$pdbCode' ";
 
    }
 
    print FO "               # codes of the templates\n";
 
    print FO "SET SEQUENCE = '$targetProteinCode'              # code of the target\n";
 
 
 
    print FO printTopScriptInvariableContentEnd();
 
    close FO;
 
}
 
 
 
sub printTopScriptInvariableContentFront
 
{
 
    return "".
 
        "# Homology modelling by the MODELLER TOP routine 'model'.\n".
 
        "\n".
 
        "INCLUDE                            # Include the predefined TOP routines\n".
 
        "\n".
 
        "SET OUTPUT_CONTROL = 1 1 1 1 1      # uncomment to produce a large log file\n".
 
        "\n";
 
}
 
 
 
sub printTopScriptInvariableContentEnd
 
{
 
    return "".
 
        "SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n".
 
        "SET STARTING_MODEL= 1              # index of the first model\n".
 
        "SET ENDING_MODEL  = 1              # index of the last model\n".
 
        "                                   # (determines how many models to calculate)\n".
 
        "CALL ROUTINE = 'model'              # do homology modelling\n".
 
        "\n";
 
}
 
 
 
</pre>
 

Latest revision as of 17:52, 12 May 2006

  1. !/usr/bin/perl -w

    use strict;

    doMain();

    sub doMain
    {
    my $baseDirPath = shift @ARGV || "";
    my $pirAlignmentFile = shift @ARGV || "";
    my $targetProteinCode = shift @ARGV || "";
    my $pdbFiles_ref = \@ARGV || ();

    print "----\n";
    print "\nChecking command-line parameters ...\n";
    printUsageAndExit() unless ( $baseDirPath && -d $baseDirPath );
    $baseDirPath =~ s/\/$//; # remove the ending slash if exists

    printUsageAndExit() unless ( $pirAlignmentFile && -f "$baseDirPath/$pirAlignmentFile" && $pirAlignmentFile =~ /\.pir$/ );
    printUsageAndExit() unless ( $targetProteinCode && length($targetProteinCode) == 4 );
    printUsageAndExit() unless ( @$pdbFiles_ref > 0 );

    foreach my $pdbFile ( @$pdbFiles_ref )
    {
    printUsageAndExit() unless ( -f "$baseDirPath/$pdbFile" && $pdbFile =~ /\.pdb$/ );
    }

    print "Making modeller input files ...\n";
    makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref );
    }

    sub printUsageAndExit
    {
    print STDERR "Error!\n\n";
    print STDERR "Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n";
    print STDERR " A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n";
    print STDERR " A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n";
    print STDERR " A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n";
    print STDERR " A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n";
    print STDERR "----\n";
    exit;
    }

    sub makeModellerInput
    {
    my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_;
    print "\tchanging the directory to $baseDirPath ...\n";
    chdir $baseDirPath or die "Dir Change Error, $baseDirPath: $!";

    my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/;

    # modeller input files
    my $alignmentFile = $outputPrefix. ".ali";
    my $topScriptFile = $outputPrefix. ".top";
    my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, "atm" );

    my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, "pdb" );
    my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile );

    # parse the target protein's alignment and information first
    my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} );

    # parse alignments and information of known proteins and make atom files of them
    foreach my $pdbCode ( keys %pdbCode2PdbFile )
    {
    my $atomFile = $pdbCode2AtomFile{ $pdbCode };
    my $pdbFile = $pdbCode2PdbFile{ $pdbCode };
    my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode };

    # make the atom files
    print "\tmaking the atom file, $atomFile ...\n";
    makeAtomFile( $pdbFile, $atomFile );

    $alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile );
    }

    # make the alignment file
    print "\tmaking the alignment file, $alignmentFile ...\n";
    makeAlignmentFile( $alignmentFile, $alignmentFileContent );

    # make the top script file
    print "\tmaking the top script file, $topScriptFile ...\n";
    makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile );

    print "Finished!\n\n";
    print "----\n";
    }

    sub get_pdbCode2File
    {
    my ( $pdbFiles_ref, $fileExtension ) = @_;
    my %pdbCode2File = ();

    foreach my $pdbFile ( @$pdbFiles_ref )
    {
    my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters
    my $file = $pdbCode. ".". $fileExtension;
    $pdbCode2File{ $pdbCode } = $file;
    }

    return %pdbCode2File;
    }

    sub get_pdbCode2PirAlignment
    {
    my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_;
    my %pdbCode2PirAlignment = ();

    my @pdbCodes = ();
    push @pdbCodes, $targetProteinCode;
    push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref );

    $/ = "*\n"; # change the reading-block separator
    open FI, $pirAlignmentFile or die "File Open Error, $pirAlignmentFile: $!\n";

    while ( my $entry = <fi> )
    {
    chomp $entry;
    next if $entry =~ /^\s*$/;

    my @lines = split /\n/, $entry;

    my $pdbCode = "";
    foreach my $code ( @pdbCodes )
    {
    if ( $lines[0] =~ /$code/ )
    {
    $pdbCode = $code;
    last;
    }
    }

    if ( $pdbCode eq "" )
    {
    print STDERR "PDB code mistmatch with pdb files on this entry!\n";
    print STDERR "pdbCode: $pdbCode\n";
    print STDERR "entry: $entry\n";
    }

    $pdbCode2PirAlignment{ $pdbCode } = "$entry*\n";
    print "\nPDB code To PIR alignment found: $pdbCode =>\n$entry\n";
    }

    close FI;
    $/ = "\n"; # restore the reading-block separator

    return %pdbCode2PirAlignment;
    }

    sub getTargetProteinAlignment
    {
    my ( $targetProteinCode, $targetProteinAlignment ) = @_;
    my $targetProteinAlignmentContent = "";

    my @lines = split /\n/, $targetProteinAlignment;
    shift @lines; # remove header line
    shift @lines; # remove blank line
    my $sequenceAlignmentOnly = join "\n", @lines;

    $targetProteinAlignmentContent .= ">P1;$targetProteinCode\n";
    $targetProteinAlignmentContent .= "sequence: : : : : : : : :\n";
    $targetProteinAlignmentContent .= "$sequenceAlignmentOnly\n";

    return $targetProteinAlignmentContent;
    }

    sub makeAtomFile
    {
    my ( $pdbFile, $atomFile ) = @_;
     !system("cp $pdbFile $atomFile") or die "File Copy Error: $!";
    print "\tcopying $pdbFile to $atomFile ...\n";
    }

    sub get_eachAlignmentContent
    {
    my ( $pdbCode, $pirAlignment, $pdbFile ) = @_;
    my $eachAlignmentContent = "";

    my @lines = split /\n/, $pirAlignment;
    shift @lines; # remove header line
    shift @lines; # remove blank line
    pop @lines; # remove ending *
    my $sequenceAlignmentOnly = join "\n", @lines;
    $sequenceAlignmentOnly =~ s/[Xx]/-/g;

    my ( $structureInfoLine, $adjustedAlignmentSequence )
    = get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile );
    $eachAlignmentContent .= ">P1;$pdbCode\n";
    $eachAlignmentContent .= $structureInfoLine;
    $eachAlignmentContent .= "$adjustedAlignmentSequence\n*\n";

    return $eachAlignmentContent;
    }

    sub get_structureInfoAndAdjustedSequence
    {
    my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_;
    my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( "", "" );

    my ( $startResNum, $startChain, $endResNum, $endChain ) = ("", "", "", "");
    my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = ("", "", "", "", "");
    my $allResiduesInPdb = "";
    my $includedResNum = 0;
    my $isChainExist = 1; # true


    open FI, $pdbFile or die "File Open Error, $pdbFile: $!\n";
    while ( my $line = <fi> )
    {
    chomp $line;

    if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/)
    {
    $proteinName = $1;
    }
    elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/)
    {
    $isChainExist = 0; # false
    }
    elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/)
    {
    $source = $1;
    }
    elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/)
    {
    my $method = $1;
    $structureXorN =
    ( $method =~ /X-RAY/ ) ? "structureX" :
    ( $method =~ /NMR/ ) ? "structureN" :
    "structure";
    }
    elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/)
    {
    $resolution = $1;
    }
    elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+\(WORKING SET\)\s*:\s*([\d\.\-]+)\s*$/)
    {
    $rFactor = $1;
    }
    elsif ($line =~ /^ATOM/)
    {
    my @values = split /\s+/, $line;
    my $threeLetterAminoAcidCode = $values[3];
    my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode);

    if ( $startResNum eq "" && $isChainExist )
    {
    ( $startChain, $startResNum ) = ( $values[4], $values[5] );
    }
    elsif ( $startResNum eq "" )
    {
    $startResNum = $values[4];
    }

    if ( $isChainExist )
    {
    ( $endChain, $endResNum ) = ( $values[4], $values[5] );
    }
    else
    {
    $endResNum = $values[4];
    }

    if ( $endResNum ne $includedResNum )
    {
    $allResiduesInPdb .= $oneLetterAminoAcidCode;
    $includedResNum = $endResNum;
    }
    }
    elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read )
    {
    last;
    }
    }
    close FI;

    $structureInfoLine = "$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain ".
    ":$proteinName :$source :$resolution :$rFactor \n";
    $adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb );

    return ( $structureInfoLine, $adjustedAlignmentSequence );
    }

    sub convertThreeToOneLetterAminoAcidCode
    {
    my ( $threeLetterAminoAcidCode ) = @_;
    return
    ( $threeLetterAminoAcidCode =~ /gly/i ) ? "G" :
    ( $threeLetterAminoAcidCode =~ /pro/i ) ? "P" :
    ( $threeLetterAminoAcidCode =~ /ala/i ) ? "A" :
    ( $threeLetterAminoAcidCode =~ /val/i ) ? "V" :
    ( $threeLetterAminoAcidCode =~ /leu/i ) ? "L" :
    ( $threeLetterAminoAcidCode =~ /ile/i ) ? "I" :
    ( $threeLetterAminoAcidCode =~ /met/i ) ? "M" :
    ( $threeLetterAminoAcidCode =~ /cys/i ) ? "C" :
    ( $threeLetterAminoAcidCode =~ /phe/i ) ? "F" :
    ( $threeLetterAminoAcidCode =~ /tyr/i ) ? "Y" :
    ( $threeLetterAminoAcidCode =~ /trp/i ) ? "W" :
    ( $threeLetterAminoAcidCode =~ /his/i ) ? "H" :
    ( $threeLetterAminoAcidCode =~ /lys/i ) ? "K" :
    ( $threeLetterAminoAcidCode =~ /arg/i ) ? "R" :
    ( $threeLetterAminoAcidCode =~ /gln/i ) ? "Q" :
    ( $threeLetterAminoAcidCode =~ /asn/i ) ? "N" :
    ( $threeLetterAminoAcidCode =~ /glu/i ) ? "E" :
    ( $threeLetterAminoAcidCode =~ /asp/i ) ? "D" :
    ( $threeLetterAminoAcidCode =~ /ser/i ) ? "S" :
    ( $threeLetterAminoAcidCode =~ /thr/i ) ? "T" :
    "-";
    }

    sub get_adjustedAlignmentSequence
    {
    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
    my $adjustedAlignmentSequence = "";

    my @lines = split /\n/, $sequenceAlignmentOnly;
    my $charactersPerLine = length $lines[0];
    $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters

    # substitute front and end residues without atom coordinates to '-'s
    my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb );
    my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb );
    $adjustedAlignmentSequence = "-" x ($startPos - 1).
    substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based
    "-" x ( length($sequenceAlignmentOnly) - $endPos );

    # substitute loop residues without atom coordinates to '-'s
    my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb );
    foreach my $loopPosition ( @loopPositions )
    {
    my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56
    $adjustedAlignmentSequence =
    substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ).
    "-" x ( $loopEndPos - $loopStartPos + 1).
    substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos );
    }

    my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence;
    $dashRemovedAdjustedAlignmentSequence =~ s/-//g;

    if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb )
    {
    print STDERR "\nError!!!\n";
    print STDERR "Number of residues in the PIR-alignment file and the PDB file mismatches !\n";
    print STDERR "Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n";
    print STDERR "dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n";
    print STDERR "allResiduesInPdb: $allResiduesInPdb\n";
    }

    return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine);
    }

    sub get_startPos
    {
    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;

    # Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment
    # matching with the residues in the PDB file
    my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 );

    my $startPos = "";
    my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;
    my $pos = 0;

    foreach my $aa ( @allAminoAcidsInTheAlignmentSequence )
    {
    $pos++;
    next if $aa eq "-";

    my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 );
    $frontRemovedSeq =~ s/-//g; # remove all dashes

    if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i )
    {
    $startPos = $pos;
    print "\tstart position or non-loop start position determined: $pos\n";
    last;
    }
    }

    if ( $startPos eq "" )
    {
    print STDERR "\nError!\n";
    print STDERR "Start position mismatches!\n";
    print STDERR "Check the start amino acids of the alignment!\n\n";
    print STDERR "sequenceAlignment: $sequenceAlignmentOnly\n";
    print STDERR "residuesInPdb: $allResiduesInPdb\n\n";
    }

    return $startPos;
    }

    sub get_endPos
    {
    my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;

    # Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment
    # matching with the residues in the PDB file
    my $reversedLastTenAminoAcidsOfAllResiduesInPdb =
    reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 ));

    my $endPos = "";
    my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;
    my $pos = @allAminoAcidsInTheAlignmentSequence + 1;

    foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence )
    {
    $pos--;
    next if $aa eq "-";

    my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) );
    $endRemovedReverseSeq =~ s/-//g; # remove all dashes

    if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i )
    {
    $endPos = $pos;
    print "\tend position determined: $pos\n";
    last;
    }
    }

    if ( $endPos eq "" )
    {
    print STDERR "\nError!\n";
    print STDERR "End position mismatches between the alignment file and the pdb file!\n";
    print STDERR "Check the end amino acids of the alignment!\n\n";
    print STDERR "sequenceAlignmentOnly: $sequenceAlignmentOnly\n";
    print STDERR "allResiduesInPdb: $allResiduesInPdb\n\n";
    }

    return $endPos;
    }

    sub reverseMyString
    {
    my ( $string ) = @_;
    my @allChars = split //, $string;

    return join("", reverse @allChars );
    }

    sub get_loopPositions
    {
    my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_;
    my @loopPositions = ();

    my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence;
    $dashRemovedAdjustedSequence =~ s/-//g;

    if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb )
    {
    return @loopPositions;
    }

    my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence;
    my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb;
    my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 );

    while ( $alignmentResiduePos <= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number
    {
    $alignmentResiduePos++;
    next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq "-";
    $pdbFileResiduePos++;

    if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne
    $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] )
    {
    my $frontRemovedAlignmentSeq =
    substr( $adjustedAlignmentSequence,
    $alignmentResiduePos - 1,
    length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 );
    my $frontRemovedPdbResidueSeq =
    substr( $allResiduesInPdb,
    $pdbFileResiduePos - 1,
    length($allResiduesInPdb) - $pdbFileResiduePos + 1 );

    my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1;
    my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1);

    if ( $loopLength ne "" && $loopStartPos <= $loopEndPos )
    {
    push @loopPositions, "$loopStartPos-$loopEndPos";
    print "\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n";

    $alignmentResiduePos += $loopLength - 1;
    $pdbFileResiduePos--;
    }
    }
    }

    return @loopPositions;
    }

    sub formatFastaSequence
    {
    my ($seq, $charactersPerLine) = @_;
    $seq =~ s/(.{$charactersPerLine})/$1\n/g;
    return $seq;
    }

    sub makeAlignmentFile
    {
    my ( $alignmentFile, $alignmentFileContent ) = @_;

    open FO, ">$alignmentFile" or die "File Write Error, $alignmentFile: $!";
    print FO $alignmentFileContent;
    close FO;
    }

    sub makeTopScriptFile
    {
    my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_;

    open FO, ">$topScriptFile" or die "File Write Error, $topScriptFile: $!";
    print FO printTopScriptInvariableContentFront();

    # print variable contents of the top script file
    print FO "SET ALNFILE = '$alignmentFile' # alignment filename\n";
    print FO "SET KNOWNS = ";
    foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref )
    {
    print FO "'$pdbCode' ";
    }
    print FO " # codes of the templates\n";
    print FO "SET SEQUENCE = '$targetProteinCode' # code of the target\n";

    print FO printTopScriptInvariableContentEnd();
    close FO;
    }

    sub printTopScriptInvariableContentFront
    {
    return "".
    "# Homology modelling by the MODELLER TOP routine 'model'.\n".
    "\n".
    "INCLUDE # Include the predefined TOP routines\n".
    "\n".
    "SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file\n".
    "\n";
    }

    sub printTopScriptInvariableContentEnd
    {
    return "".
    "SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n".
    "SET STARTING_MODEL= 1 # index of the first model\n".
    "SET ENDING_MODEL = 1 # index of the last model\n".
    " # (determines how many models to calculate)\n".
    "CALL ROUTINE = 'model' # do homology modelling\n".
    "\n";
    }

    </fi></fi>