Difference between revisions of "Perl script for ATM"
From Opengenome.net
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
− | + | #!/usr/bin/perl -w<br /><br />use strict;<br /><br />doMain();<br /><br />sub doMain<br />{<br /> my $baseDirPath = shift @ARGV || "";<br /> my $pirAlignmentFile = shift @ARGV || "";<br /> my $targetProteinCode = shift @ARGV || "";<br /> my $pdbFiles_ref = \@ARGV || ();<br /><br /> print "----\n";<br /> print "\nChecking command-line parameters ...\n";<br /> printUsageAndExit() unless ( $baseDirPath && -d $baseDirPath );<br /> $baseDirPath =~ s/\/$//; # remove the ending slash if exists<br /><br /> printUsageAndExit() unless ( $pirAlignmentFile && -f "$baseDirPath/$pirAlignmentFile" && $pirAlignmentFile =~ /\.pir$/ );<br /> printUsageAndExit() unless ( $targetProteinCode && length($targetProteinCode) == 4 );<br /> printUsageAndExit() unless ( @$pdbFiles_ref > 0 );<br /><br /> foreach my $pdbFile ( @$pdbFiles_ref )<br /> {<br /> printUsageAndExit() unless ( -f "$baseDirPath/$pdbFile" && $pdbFile =~ /\.pdb$/ );<br /> }<br /><br /> print "Making modeller input files ...\n";<br /> makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ); <br />}<br /><br />sub printUsageAndExit<br />{<br /> print STDERR "Error!\n\n";<br /> print STDERR "Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n"; <br /> print STDERR " A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n";<br /> print STDERR " A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n";<br /> print STDERR " A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n";<br /> print STDERR " A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n";<br /> print STDERR "----\n";<br /> exit;<br />}<br /><br />sub makeModellerInput<br />{<br /> my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_;<br /> print "\tchanging the directory to $baseDirPath ...\n";<br /> chdir $baseDirPath or die "Dir Change Error, $baseDirPath: $!";<br /><br /> my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/;<br /><br /> # modeller input files<br /> my $alignmentFile = $outputPrefix. ".ali";<br /> my $topScriptFile = $outputPrefix. ".top";<br /> my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, "atm" );<br /><br /> my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, "pdb" );<br /> my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile );<br /><br /> # parse the target protein's alignment and information first<br /> my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} );<br /><br /> # parse alignments and information of known proteins and make atom files of them<br /> foreach my $pdbCode ( keys %pdbCode2PdbFile )<br /> {<br /> my $atomFile = $pdbCode2AtomFile{ $pdbCode };<br /> my $pdbFile = $pdbCode2PdbFile{ $pdbCode };<br /> my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode };<br /><br /> # make the atom files<br /> print "\tmaking the atom file, $atomFile ...\n";<br /> makeAtomFile( $pdbFile, $atomFile );<br /><br /> $alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile );<br /> }<br /><br /> # make the alignment file<br /> print "\tmaking the alignment file, $alignmentFile ...\n";<br /> makeAlignmentFile( $alignmentFile, $alignmentFileContent );<br /><br /> # make the top script file <br /> print "\tmaking the top script file, $topScriptFile ...\n";<br /> makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile );<br /><br /> print "Finished!\n\n";<br /> print "----\n";<br />}<br /><br />sub get_pdbCode2File<br />{<br /> my ( $pdbFiles_ref, $fileExtension ) = @_;<br /> my %pdbCode2File = ();<br /><br /> foreach my $pdbFile ( @$pdbFiles_ref )<br /> {<br /> my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters<br /> my $file = $pdbCode. ".". $fileExtension;<br /> $pdbCode2File{ $pdbCode } = $file;<br /> }<br /><br /> return %pdbCode2File;<br />}<br /><br />sub get_pdbCode2PirAlignment<br />{<br /> my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_;<br /> my %pdbCode2PirAlignment = ();<br /><br /> my @pdbCodes = ();<br /> push @pdbCodes, $targetProteinCode;<br /> push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref );<br /><br /> $/ = "*\n"; # change the reading-block separator<br /> open FI, $pirAlignmentFile or die "File Open Error, $pirAlignmentFile: $!\n";<br /><br /> while ( my $entry = <fi> )<br /> {<br /> chomp $entry;<br /> next if $entry =~ /^\s*$/;<br /><br /> my @lines = split /\n/, $entry;<br /><br /> my $pdbCode = "";<br /> foreach my $code ( @pdbCodes )<br /> {<br /> if ( $lines[0] =~ /$code/ )<br /> {<br /> $pdbCode = $code;<br /> last;<br /> }<br /> }<br /><br /> if ( $pdbCode eq "" )<br /> {<br /> print STDERR "PDB code mistmatch with pdb files on this entry!\n";<br /> print STDERR "pdbCode: $pdbCode\n";<br /> print STDERR "entry: $entry\n";<br /> }<br /><br /> $pdbCode2PirAlignment{ $pdbCode } = "$entry*\n";<br /> print "\nPDB code To PIR alignment found: $pdbCode =>\n$entry\n";<br /> }<br /><br /> close FI;<br /> $/ = "\n"; # restore the reading-block separator<br /><br /> return %pdbCode2PirAlignment;<br />}<br /><br />sub getTargetProteinAlignment<br />{<br /> my ( $targetProteinCode, $targetProteinAlignment ) = @_;<br /> my $targetProteinAlignmentContent = "";<br /><br /> my @lines = split /\n/, $targetProteinAlignment;<br /> shift @lines; # remove header line<br /> shift @lines; # remove blank line<br /> my $sequenceAlignmentOnly = join "\n", @lines;<br /><br /> $targetProteinAlignmentContent .= ">P1;$targetProteinCode\n";<br /> $targetProteinAlignmentContent .= "sequence: : : : : : : : :\n";<br /> $targetProteinAlignmentContent .= "$sequenceAlignmentOnly\n";<br /><br /> return $targetProteinAlignmentContent;<br />}<br /><br />sub makeAtomFile<br />{<br /> my ( $pdbFile, $atomFile ) = @_;<br /> !system("cp $pdbFile $atomFile") or die "File Copy Error: $!";<br /> print "\tcopying $pdbFile to $atomFile ...\n";<br />}<br /><br />sub get_eachAlignmentContent<br />{<br /> my ( $pdbCode, $pirAlignment, $pdbFile ) = @_;<br /> my $eachAlignmentContent = "";<br /><br /> my @lines = split /\n/, $pirAlignment;<br /> shift @lines; # remove header line<br /> shift @lines; # remove blank line<br /> pop @lines; # remove ending *<br /> my $sequenceAlignmentOnly = join "\n", @lines;<br /> $sequenceAlignmentOnly =~ s/[Xx]/-/g;<br /><br /> my ( $structureInfoLine, $adjustedAlignmentSequence ) <br /> = get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile );<br /> $eachAlignmentContent .= ">P1;$pdbCode\n";<br /> $eachAlignmentContent .= $structureInfoLine;<br /> $eachAlignmentContent .= "$adjustedAlignmentSequence\n*\n";<br /><br /> return $eachAlignmentContent;<br />}<br /><br />sub get_structureInfoAndAdjustedSequence<br />{<br /> my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_;<br /> my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( "", "" );<br /><br /> my ( $startResNum, $startChain, $endResNum, $endChain ) = ("", "", "", "");<br /> my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = ("", "", "", "", "");<br /> my $allResiduesInPdb = "";<br /> my $includedResNum = 0;<br /> my $isChainExist = 1; # true<br /> <br /><br /> open FI, $pdbFile or die "File Open Error, $pdbFile: $!\n";<br /> while ( my $line = <fi> )<br /> {<br /> chomp $line;<br /><br /> if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/)<br /> {<br /> $proteinName = $1;<br /> }<br /> elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/)<br /> {<br /> $isChainExist = 0; # false<br /> }<br /> elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/)<br /> {<br /> $source = $1;<br /> }<br /> elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/)<br /> {<br /> my $method = $1;<br /> $structureXorN = <br /> ( $method =~ /X-RAY/ ) ? "structureX" :<br /> ( $method =~ /NMR/ ) ? "structureN" :<br /> "structure";<br /> }<br /> elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/)<br /> {<br /> $resolution = $1;<br /> }<br /> elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+\(WORKING SET\)\s*:\s*([\d\.\-]+)\s*$/)<br /> {<br /> $rFactor = $1;<br /> }<br /> elsif ($line =~ /^ATOM/)<br /> {<br /> my @values = split /\s+/, $line;<br /> my $threeLetterAminoAcidCode = $values[3];<br /> my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode);<br /><br /> if ( $startResNum eq "" && $isChainExist )<br /> {<br /> ( $startChain, $startResNum ) = ( $values[4], $values[5] );<br /> }<br /> elsif ( $startResNum eq "" )<br /> {<br /> $startResNum = $values[4];<br /> }<br /><br /> if ( $isChainExist )<br /> {<br /> ( $endChain, $endResNum ) = ( $values[4], $values[5] );<br /> }<br /> else<br /> {<br /> $endResNum = $values[4];<br /> }<br /><br /> if ( $endResNum ne $includedResNum )<br /> {<br /> $allResiduesInPdb .= $oneLetterAminoAcidCode;<br /> $includedResNum = $endResNum;<br /> }<br /> }<br /> elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read )<br /> {<br /> last;<br /> }<br /> }<br /> close FI;<br /><br /> $structureInfoLine = "$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain ".<br /> ":$proteinName :$source :$resolution :$rFactor \n";<br /> $adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb );<br /><br /> return ( $structureInfoLine, $adjustedAlignmentSequence );<br />}<br /><br />sub convertThreeToOneLetterAminoAcidCode<br />{<br /> my ( $threeLetterAminoAcidCode ) = @_;<br /> return<br /> ( $threeLetterAminoAcidCode =~ /gly/i ) ? "G" :<br /> ( $threeLetterAminoAcidCode =~ /pro/i ) ? "P" :<br /> ( $threeLetterAminoAcidCode =~ /ala/i ) ? "A" :<br /> ( $threeLetterAminoAcidCode =~ /val/i ) ? "V" :<br /> ( $threeLetterAminoAcidCode =~ /leu/i ) ? "L" :<br /> ( $threeLetterAminoAcidCode =~ /ile/i ) ? "I" :<br /> ( $threeLetterAminoAcidCode =~ /met/i ) ? "M" :<br /> ( $threeLetterAminoAcidCode =~ /cys/i ) ? "C" :<br /> ( $threeLetterAminoAcidCode =~ /phe/i ) ? "F" :<br /> ( $threeLetterAminoAcidCode =~ /tyr/i ) ? "Y" :<br /> ( $threeLetterAminoAcidCode =~ /trp/i ) ? "W" :<br /> ( $threeLetterAminoAcidCode =~ /his/i ) ? "H" :<br /> ( $threeLetterAminoAcidCode =~ /lys/i ) ? "K" :<br /> ( $threeLetterAminoAcidCode =~ /arg/i ) ? "R" :<br /> ( $threeLetterAminoAcidCode =~ /gln/i ) ? "Q" :<br /> ( $threeLetterAminoAcidCode =~ /asn/i ) ? "N" :<br /> ( $threeLetterAminoAcidCode =~ /glu/i ) ? "E" :<br /> ( $threeLetterAminoAcidCode =~ /asp/i ) ? "D" :<br /> ( $threeLetterAminoAcidCode =~ /ser/i ) ? "S" :<br /> ( $threeLetterAminoAcidCode =~ /thr/i ) ? "T" :<br /> "-";<br />}<br /><br />sub get_adjustedAlignmentSequence<br />{<br /> my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;<br /> my $adjustedAlignmentSequence = "";<br /><br /> my @lines = split /\n/, $sequenceAlignmentOnly;<br /> my $charactersPerLine = length $lines[0];<br /> $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters<br /><br /> # substitute front and end residues without atom coordinates to '-'s <br /> my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb ); <br /> my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb ); <br /> $adjustedAlignmentSequence = "-" x ($startPos - 1).<br /> substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based<br /> "-" x ( length($sequenceAlignmentOnly) - $endPos );<br /><br /> # substitute loop residues without atom coordinates to '-'s <br /> my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb );<br /> foreach my $loopPosition ( @loopPositions )<br /> {<br /> my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56<br /> $adjustedAlignmentSequence = <br /> substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ).<br /> "-" x ( $loopEndPos - $loopStartPos + 1).<br /> substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos );<br /> }<br /><br /> my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence;<br /> $dashRemovedAdjustedAlignmentSequence =~ s/-//g;<br /><br /> if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb )<br /> {<br /> print STDERR "\nError!!!\n";<br /> print STDERR "Number of residues in the PIR-alignment file and the PDB file mismatches !\n";<br /> print STDERR "Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n";<br /> print STDERR "dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n";<br /> print STDERR "allResiduesInPdb: $allResiduesInPdb\n";<br /> }<br /><br /> return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine);<br />}<br /><br />sub get_startPos<br />{<br /> my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;<br /><br /> # Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment<br /> # matching with the residues in the PDB file <br /> my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 );<br /><br /> my $startPos = "";<br /> my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;<br /> my $pos = 0;<br /><br /> foreach my $aa ( @allAminoAcidsInTheAlignmentSequence )<br /> {<br /> $pos++;<br /> next if $aa eq "-";<br /><br /> my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 );<br /> $frontRemovedSeq =~ s/-//g; # remove all dashes<br /><br /> if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i )<br /> {<br /> $startPos = $pos;<br /> print "\tstart position or non-loop start position determined: $pos\n";<br /> last;<br /> }<br /> }<br /><br /> if ( $startPos eq "" )<br /> {<br /> print STDERR "\nError!\n";<br /> print STDERR "Start position mismatches!\n";<br /> print STDERR "Check the start amino acids of the alignment!\n\n";<br /> print STDERR "sequenceAlignment: $sequenceAlignmentOnly\n";<br /> print STDERR "residuesInPdb: $allResiduesInPdb\n\n";<br /> }<br /><br /> return $startPos;<br />}<br /><br />sub get_endPos<br />{<br /> my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;<br /><br /> # Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment<br /> # matching with the residues in the PDB file <br /> my $reversedLastTenAminoAcidsOfAllResiduesInPdb = <br /> reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 ));<br /><br /> my $endPos = "";<br /> my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;<br /> my $pos = @allAminoAcidsInTheAlignmentSequence + 1;<br /><br /> foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence )<br /> {<br /> $pos--;<br /> next if $aa eq "-";<br /><br /> my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) );<br /> $endRemovedReverseSeq =~ s/-//g; # remove all dashes<br /><br /> if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i )<br /> {<br /> $endPos = $pos;<br /> print "\tend position determined: $pos\n";<br /> last;<br /> }<br /> }<br /><br /> if ( $endPos eq "" )<br /> {<br /> print STDERR "\nError!\n";<br /> print STDERR "End position mismatches between the alignment file and the pdb file!\n";<br /> print STDERR "Check the end amino acids of the alignment!\n\n";<br /> print STDERR "sequenceAlignmentOnly: $sequenceAlignmentOnly\n";<br /> print STDERR "allResiduesInPdb: $allResiduesInPdb\n\n";<br /> }<br /><br /> return $endPos;<br />}<br /><br />sub reverseMyString<br />{<br /> my ( $string ) = @_;<br /> my @allChars = split //, $string;<br /><br /> return join("", reverse @allChars );<br />}<br /><br />sub get_loopPositions<br />{<br /> my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_;<br /> my @loopPositions = ();<br /><br /> my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence;<br /> $dashRemovedAdjustedSequence =~ s/-//g;<br /><br /> if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb )<br /> {<br /> return @loopPositions;<br /> }<br /><br /> my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence;<br /> my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb;<br /> my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 );<br /><br /> while ( $alignmentResiduePos <= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number<br /> {<br /> $alignmentResiduePos++;<br /> next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq "-";<br /> $pdbFileResiduePos++;<br /><br /> if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne <br /> $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] )<br /> {<br /> my $frontRemovedAlignmentSeq = <br /> substr( $adjustedAlignmentSequence, <br /> $alignmentResiduePos - 1, <br /> length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 );<br /> my $frontRemovedPdbResidueSeq = <br /> substr( $allResiduesInPdb,<br /> $pdbFileResiduePos - 1,<br /> length($allResiduesInPdb) - $pdbFileResiduePos + 1 );<br /><br /> my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1;<br /> my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1);<br /><br /> if ( $loopLength ne "" && $loopStartPos <= $loopEndPos )<br /> {<br /> push @loopPositions, "$loopStartPos-$loopEndPos";<br /> print "\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n";<br /><br /> $alignmentResiduePos += $loopLength - 1;<br /> $pdbFileResiduePos--;<br /> }<br /> }<br /> }<br /><br /> return @loopPositions;<br />}<br /><br />sub formatFastaSequence<br />{<br /> my ($seq, $charactersPerLine) = @_;<br /> $seq =~ s/(.{$charactersPerLine})/$1\n/g;<br /> return $seq;<br />}<br /><br />sub makeAlignmentFile<br />{<br /> my ( $alignmentFile, $alignmentFileContent ) = @_;<br /><br /> open FO, ">$alignmentFile" or die "File Write Error, $alignmentFile: $!";<br /> print FO $alignmentFileContent;<br /> close FO;<br />}<br /><br />sub makeTopScriptFile<br />{<br /> my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_;<br /><br /> open FO, ">$topScriptFile" or die "File Write Error, $topScriptFile: $!";<br /> print FO printTopScriptInvariableContentFront();<br /><br /> # print variable contents of the top script file<br /> print FO "SET ALNFILE = '$alignmentFile' # alignment filename\n";<br /> print FO "SET KNOWNS = ";<br /> foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref )<br /> {<br /> print FO "'$pdbCode' ";<br /> }<br /> print FO " # codes of the templates\n";<br /> print FO "SET SEQUENCE = '$targetProteinCode' # code of the target\n";<br /><br /> print FO printTopScriptInvariableContentEnd();<br /> close FO;<br />}<br /><br />sub printTopScriptInvariableContentFront<br />{<br /> return "".<br /> "# Homology modelling by the MODELLER TOP routine 'model'.\n".<br /> "\n".<br /> "INCLUDE # Include the predefined TOP routines\n".<br /> "\n".<br /> "SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file\n".<br /> "\n";<br />}<br /><br />sub printTopScriptInvariableContentEnd<br />{<br /> return "".<br /> "SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n".<br /> "SET STARTING_MODEL= 1 # index of the first model\n".<br /> "SET ENDING_MODEL = 1 # index of the last model\n".<br /> " # (determines how many models to calculate)\n".<br /> "CALL ROUTINE = 'model' # do homology modelling\n".<br /> "\n";<br />}<br /><br /></fi></fi> | |
− | #!/usr/bin/perl -w | ||
− | |||
− | use strict; | ||
− | |||
− | doMain(); | ||
− | |||
− | sub doMain | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub printUsageAndExit | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub makeModellerInput | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_pdbCode2File | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_pdbCode2PirAlignment | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub getTargetProteinAlignment | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub makeAtomFile | ||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_eachAlignmentContent | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_structureInfoAndAdjustedSequence | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub convertThreeToOneLetterAminoAcidCode | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_adjustedAlignmentSequence | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_startPos | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_endPos | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub reverseMyString | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub get_loopPositions | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub formatFastaSequence | ||
− | { | ||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub makeAlignmentFile | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub makeTopScriptFile | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub printTopScriptInvariableContentFront | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | sub printTopScriptInvariableContentEnd | ||
− | { | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | } | ||
− | |||
− | </ |
Latest revision as of 17:52, 12 May 2006
- !/usr/bin/perl -w
use strict;
doMain();
sub doMain
{
my $baseDirPath = shift @ARGV || "";
my $pirAlignmentFile = shift @ARGV || "";
my $targetProteinCode = shift @ARGV || "";
my $pdbFiles_ref = \@ARGV || ();
print "----\n";
print "\nChecking command-line parameters ...\n";
printUsageAndExit() unless ( $baseDirPath && -d $baseDirPath );
$baseDirPath =~ s/\/$//; # remove the ending slash if exists
printUsageAndExit() unless ( $pirAlignmentFile && -f "$baseDirPath/$pirAlignmentFile" && $pirAlignmentFile =~ /\.pir$/ );
printUsageAndExit() unless ( $targetProteinCode && length($targetProteinCode) == 4 );
printUsageAndExit() unless ( @$pdbFiles_ref > 0 );
foreach my $pdbFile ( @$pdbFiles_ref )
{
printUsageAndExit() unless ( -f "$baseDirPath/$pdbFile" && $pdbFile =~ /\.pdb$/ );
}
print "Making modeller input files ...\n";
makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref );
}
sub printUsageAndExit
{
print STDERR "Error!\n\n";
print STDERR "Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n";
print STDERR " A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n";
print STDERR " A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n";
print STDERR " A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n";
print STDERR " A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n";
print STDERR "----\n";
exit;
}
sub makeModellerInput
{
my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_;
print "\tchanging the directory to $baseDirPath ...\n";
chdir $baseDirPath or die "Dir Change Error, $baseDirPath: $!";
my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/;
# modeller input files
my $alignmentFile = $outputPrefix. ".ali";
my $topScriptFile = $outputPrefix. ".top";
my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, "atm" );
my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, "pdb" );
my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile );
# parse the target protein's alignment and information first
my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} );
# parse alignments and information of known proteins and make atom files of them
foreach my $pdbCode ( keys %pdbCode2PdbFile )
{
my $atomFile = $pdbCode2AtomFile{ $pdbCode };
my $pdbFile = $pdbCode2PdbFile{ $pdbCode };
my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode };
# make the atom files
print "\tmaking the atom file, $atomFile ...\n";
makeAtomFile( $pdbFile, $atomFile );
$alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile );
}
# make the alignment file
print "\tmaking the alignment file, $alignmentFile ...\n";
makeAlignmentFile( $alignmentFile, $alignmentFileContent );
# make the top script file
print "\tmaking the top script file, $topScriptFile ...\n";
makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile );
print "Finished!\n\n";
print "----\n";
}
sub get_pdbCode2File
{
my ( $pdbFiles_ref, $fileExtension ) = @_;
my %pdbCode2File = ();
foreach my $pdbFile ( @$pdbFiles_ref )
{
my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters
my $file = $pdbCode. ".". $fileExtension;
$pdbCode2File{ $pdbCode } = $file;
}
return %pdbCode2File;
}
sub get_pdbCode2PirAlignment
{
my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_;
my %pdbCode2PirAlignment = ();
my @pdbCodes = ();
push @pdbCodes, $targetProteinCode;
push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref );
$/ = "*\n"; # change the reading-block separator
open FI, $pirAlignmentFile or die "File Open Error, $pirAlignmentFile: $!\n";
while ( my $entry = <fi> )
{
chomp $entry;
next if $entry =~ /^\s*$/;
my @lines = split /\n/, $entry;
my $pdbCode = "";
foreach my $code ( @pdbCodes )
{
if ( $lines[0] =~ /$code/ )
{
$pdbCode = $code;
last;
}
}
if ( $pdbCode eq "" )
{
print STDERR "PDB code mistmatch with pdb files on this entry!\n";
print STDERR "pdbCode: $pdbCode\n";
print STDERR "entry: $entry\n";
}
$pdbCode2PirAlignment{ $pdbCode } = "$entry*\n";
print "\nPDB code To PIR alignment found: $pdbCode =>\n$entry\n";
}
close FI;
$/ = "\n"; # restore the reading-block separator
return %pdbCode2PirAlignment;
}
sub getTargetProteinAlignment
{
my ( $targetProteinCode, $targetProteinAlignment ) = @_;
my $targetProteinAlignmentContent = "";
my @lines = split /\n/, $targetProteinAlignment;
shift @lines; # remove header line
shift @lines; # remove blank line
my $sequenceAlignmentOnly = join "\n", @lines;
$targetProteinAlignmentContent .= ">P1;$targetProteinCode\n";
$targetProteinAlignmentContent .= "sequence: : : : : : : : :\n";
$targetProteinAlignmentContent .= "$sequenceAlignmentOnly\n";
return $targetProteinAlignmentContent;
}
sub makeAtomFile
{
my ( $pdbFile, $atomFile ) = @_;
!system("cp $pdbFile $atomFile") or die "File Copy Error: $!";
print "\tcopying $pdbFile to $atomFile ...\n";
}
sub get_eachAlignmentContent
{
my ( $pdbCode, $pirAlignment, $pdbFile ) = @_;
my $eachAlignmentContent = "";
my @lines = split /\n/, $pirAlignment;
shift @lines; # remove header line
shift @lines; # remove blank line
pop @lines; # remove ending *
my $sequenceAlignmentOnly = join "\n", @lines;
$sequenceAlignmentOnly =~ s/[Xx]/-/g;
my ( $structureInfoLine, $adjustedAlignmentSequence )
= get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile );
$eachAlignmentContent .= ">P1;$pdbCode\n";
$eachAlignmentContent .= $structureInfoLine;
$eachAlignmentContent .= "$adjustedAlignmentSequence\n*\n";
return $eachAlignmentContent;
}
sub get_structureInfoAndAdjustedSequence
{
my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_;
my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( "", "" );
my ( $startResNum, $startChain, $endResNum, $endChain ) = ("", "", "", "");
my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = ("", "", "", "", "");
my $allResiduesInPdb = "";
my $includedResNum = 0;
my $isChainExist = 1; # true
open FI, $pdbFile or die "File Open Error, $pdbFile: $!\n";
while ( my $line = <fi> )
{
chomp $line;
if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/)
{
$proteinName = $1;
}
elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/)
{
$isChainExist = 0; # false
}
elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/)
{
$source = $1;
}
elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/)
{
my $method = $1;
$structureXorN =
( $method =~ /X-RAY/ ) ? "structureX" :
( $method =~ /NMR/ ) ? "structureN" :
"structure";
}
elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/)
{
$resolution = $1;
}
elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+\(WORKING SET\)\s*:\s*([\d\.\-]+)\s*$/)
{
$rFactor = $1;
}
elsif ($line =~ /^ATOM/)
{
my @values = split /\s+/, $line;
my $threeLetterAminoAcidCode = $values[3];
my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode);
if ( $startResNum eq "" && $isChainExist )
{
( $startChain, $startResNum ) = ( $values[4], $values[5] );
}
elsif ( $startResNum eq "" )
{
$startResNum = $values[4];
}
if ( $isChainExist )
{
( $endChain, $endResNum ) = ( $values[4], $values[5] );
}
else
{
$endResNum = $values[4];
}
if ( $endResNum ne $includedResNum )
{
$allResiduesInPdb .= $oneLetterAminoAcidCode;
$includedResNum = $endResNum;
}
}
elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read )
{
last;
}
}
close FI;
$structureInfoLine = "$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain ".
":$proteinName :$source :$resolution :$rFactor \n";
$adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb );
return ( $structureInfoLine, $adjustedAlignmentSequence );
}
sub convertThreeToOneLetterAminoAcidCode
{
my ( $threeLetterAminoAcidCode ) = @_;
return
( $threeLetterAminoAcidCode =~ /gly/i ) ? "G" :
( $threeLetterAminoAcidCode =~ /pro/i ) ? "P" :
( $threeLetterAminoAcidCode =~ /ala/i ) ? "A" :
( $threeLetterAminoAcidCode =~ /val/i ) ? "V" :
( $threeLetterAminoAcidCode =~ /leu/i ) ? "L" :
( $threeLetterAminoAcidCode =~ /ile/i ) ? "I" :
( $threeLetterAminoAcidCode =~ /met/i ) ? "M" :
( $threeLetterAminoAcidCode =~ /cys/i ) ? "C" :
( $threeLetterAminoAcidCode =~ /phe/i ) ? "F" :
( $threeLetterAminoAcidCode =~ /tyr/i ) ? "Y" :
( $threeLetterAminoAcidCode =~ /trp/i ) ? "W" :
( $threeLetterAminoAcidCode =~ /his/i ) ? "H" :
( $threeLetterAminoAcidCode =~ /lys/i ) ? "K" :
( $threeLetterAminoAcidCode =~ /arg/i ) ? "R" :
( $threeLetterAminoAcidCode =~ /gln/i ) ? "Q" :
( $threeLetterAminoAcidCode =~ /asn/i ) ? "N" :
( $threeLetterAminoAcidCode =~ /glu/i ) ? "E" :
( $threeLetterAminoAcidCode =~ /asp/i ) ? "D" :
( $threeLetterAminoAcidCode =~ /ser/i ) ? "S" :
( $threeLetterAminoAcidCode =~ /thr/i ) ? "T" :
"-";
}
sub get_adjustedAlignmentSequence
{
my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
my $adjustedAlignmentSequence = "";
my @lines = split /\n/, $sequenceAlignmentOnly;
my $charactersPerLine = length $lines[0];
$sequenceAlignmentOnly =~ s/\n//g; # remove new line characters
# substitute front and end residues without atom coordinates to '-'s
my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb );
my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb );
$adjustedAlignmentSequence = "-" x ($startPos - 1).
substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based
"-" x ( length($sequenceAlignmentOnly) - $endPos );
# substitute loop residues without atom coordinates to '-'s
my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb );
foreach my $loopPosition ( @loopPositions )
{
my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56
$adjustedAlignmentSequence =
substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ).
"-" x ( $loopEndPos - $loopStartPos + 1).
substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos );
}
my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence;
$dashRemovedAdjustedAlignmentSequence =~ s/-//g;
if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb )
{
print STDERR "\nError!!!\n";
print STDERR "Number of residues in the PIR-alignment file and the PDB file mismatches !\n";
print STDERR "Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n";
print STDERR "dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n";
print STDERR "allResiduesInPdb: $allResiduesInPdb\n";
}
return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine);
}
sub get_startPos
{
my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
# Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment
# matching with the residues in the PDB file
my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 );
my $startPos = "";
my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;
my $pos = 0;
foreach my $aa ( @allAminoAcidsInTheAlignmentSequence )
{
$pos++;
next if $aa eq "-";
my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 );
$frontRemovedSeq =~ s/-//g; # remove all dashes
if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i )
{
$startPos = $pos;
print "\tstart position or non-loop start position determined: $pos\n";
last;
}
}
if ( $startPos eq "" )
{
print STDERR "\nError!\n";
print STDERR "Start position mismatches!\n";
print STDERR "Check the start amino acids of the alignment!\n\n";
print STDERR "sequenceAlignment: $sequenceAlignmentOnly\n";
print STDERR "residuesInPdb: $allResiduesInPdb\n\n";
}
return $startPos;
}
sub get_endPos
{
my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
# Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment
# matching with the residues in the PDB file
my $reversedLastTenAminoAcidsOfAllResiduesInPdb =
reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 ));
my $endPos = "";
my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly;
my $pos = @allAminoAcidsInTheAlignmentSequence + 1;
foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence )
{
$pos--;
next if $aa eq "-";
my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) );
$endRemovedReverseSeq =~ s/-//g; # remove all dashes
if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i )
{
$endPos = $pos;
print "\tend position determined: $pos\n";
last;
}
}
if ( $endPos eq "" )
{
print STDERR "\nError!\n";
print STDERR "End position mismatches between the alignment file and the pdb file!\n";
print STDERR "Check the end amino acids of the alignment!\n\n";
print STDERR "sequenceAlignmentOnly: $sequenceAlignmentOnly\n";
print STDERR "allResiduesInPdb: $allResiduesInPdb\n\n";
}
return $endPos;
}
sub reverseMyString
{
my ( $string ) = @_;
my @allChars = split //, $string;
return join("", reverse @allChars );
}
sub get_loopPositions
{
my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_;
my @loopPositions = ();
my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence;
$dashRemovedAdjustedSequence =~ s/-//g;
if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb )
{
return @loopPositions;
}
my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence;
my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb;
my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 );
while ( $alignmentResiduePos <= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number
{
$alignmentResiduePos++;
next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq "-";
$pdbFileResiduePos++;
if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne
$allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] )
{
my $frontRemovedAlignmentSeq =
substr( $adjustedAlignmentSequence,
$alignmentResiduePos - 1,
length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 );
my $frontRemovedPdbResidueSeq =
substr( $allResiduesInPdb,
$pdbFileResiduePos - 1,
length($allResiduesInPdb) - $pdbFileResiduePos + 1 );
my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1;
my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1);
if ( $loopLength ne "" && $loopStartPos <= $loopEndPos )
{
push @loopPositions, "$loopStartPos-$loopEndPos";
print "\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n";
$alignmentResiduePos += $loopLength - 1;
$pdbFileResiduePos--;
}
}
}
return @loopPositions;
}
sub formatFastaSequence
{
my ($seq, $charactersPerLine) = @_;
$seq =~ s/(.{$charactersPerLine})/$1\n/g;
return $seq;
}
sub makeAlignmentFile
{
my ( $alignmentFile, $alignmentFileContent ) = @_;
open FO, ">$alignmentFile" or die "File Write Error, $alignmentFile: $!";
print FO $alignmentFileContent;
close FO;
}
sub makeTopScriptFile
{
my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_;
open FO, ">$topScriptFile" or die "File Write Error, $topScriptFile: $!";
print FO printTopScriptInvariableContentFront();
# print variable contents of the top script file
print FO "SET ALNFILE = '$alignmentFile' # alignment filename\n";
print FO "SET KNOWNS = ";
foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref )
{
print FO "'$pdbCode' ";
}
print FO " # codes of the templates\n";
print FO "SET SEQUENCE = '$targetProteinCode' # code of the target\n";
print FO printTopScriptInvariableContentEnd();
close FO;
}
sub printTopScriptInvariableContentFront
{
return "".
"# Homology modelling by the MODELLER TOP routine 'model'.\n".
"\n".
"INCLUDE # Include the predefined TOP routines\n".
"\n".
"SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file\n".
"\n";
}
sub printTopScriptInvariableContentEnd
{
return "".
"SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n".
"SET STARTING_MODEL= 1 # index of the first model\n".
"SET ENDING_MODEL = 1 # index of the last model\n".
" # (determines how many models to calculate)\n".
"CALL ROUTINE = 'model' # do homology modelling\n".
"\n";
}
</fi></fi>