Perl script for ATM
From Opengenome.net
Revision as of 09:33, 26 August 2005 by 211.211.234.134 (talk)
#!/usr/bin/perl -w use strict; doMain(); sub doMain { my $baseDirPath = shift @ARGV || ""; my $pirAlignmentFile = shift @ARGV || ""; my $targetProteinCode = shift @ARGV || ""; my $pdbFiles_ref = \@ARGV || (); print "----\n"; print "\nChecking command-line parameters ...\n"; printUsageAndExit() unless ( $baseDirPath && -d $baseDirPath ); $baseDirPath =~ s/\/$//; # remove the ending slash if exists printUsageAndExit() unless ( $pirAlignmentFile && -f "$baseDirPath/$pirAlignmentFile" && $pirAlignmentFile =~ /\.pir$/ ); printUsageAndExit() unless ( $targetProteinCode && length($targetProteinCode) == 4 ); printUsageAndExit() unless ( @$pdbFiles_ref > 0 ); foreach my $pdbFile ( @$pdbFiles_ref ) { printUsageAndExit() unless ( -f "$baseDirPath/$pdbFile" && $pdbFile =~ /\.pdb$/ ); } print "Making modeller input files ...\n"; makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ); } sub printUsageAndExit { print STDERR "Error!\n\n"; print STDERR "Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n"; print STDERR " A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n"; print STDERR " A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n"; print STDERR " A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n"; print STDERR " A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n"; print STDERR "----\n"; exit; } sub makeModellerInput { my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_; print "\tchanging the directory to $baseDirPath ...\n"; chdir $baseDirPath or die "Dir Change Error, $baseDirPath: $!"; my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/; # modeller input files my $alignmentFile = $outputPrefix. ".ali"; my $topScriptFile = $outputPrefix. ".top"; my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, "atm" ); my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, "pdb" ); my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile ); # parse the target protein's alignment and information first my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} ); # parse alignments and information of known proteins and make atom files of them foreach my $pdbCode ( keys %pdbCode2PdbFile ) { my $atomFile = $pdbCode2AtomFile{ $pdbCode }; my $pdbFile = $pdbCode2PdbFile{ $pdbCode }; my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode }; # make the atom files print "\tmaking the atom file, $atomFile ...\n"; makeAtomFile( $pdbFile, $atomFile ); $alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile ); } # make the alignment file print "\tmaking the alignment file, $alignmentFile ...\n"; makeAlignmentFile( $alignmentFile, $alignmentFileContent ); # make the top script file print "\tmaking the top script file, $topScriptFile ...\n"; makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile ); print "Finished!\n\n"; print "----\n"; } sub get_pdbCode2File { my ( $pdbFiles_ref, $fileExtension ) = @_; my %pdbCode2File = (); foreach my $pdbFile ( @$pdbFiles_ref ) { my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters my $file = $pdbCode. ".". $fileExtension; $pdbCode2File{ $pdbCode } = $file; } return %pdbCode2File; } sub get_pdbCode2PirAlignment { my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_; my %pdbCode2PirAlignment = (); my @pdbCodes = (); push @pdbCodes, $targetProteinCode; push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref ); $/ = "*\n"; # change the reading-block separator open FI, $pirAlignmentFile or die "File Open Error, $pirAlignmentFile: $!\n"; while ( my $entry = <FI> ) { chomp $entry; next if $entry =~ /^\s*$/; my @lines = split /\n/, $entry; my $pdbCode = ""; foreach my $code ( @pdbCodes ) { if ( $lines[0] =~ /$code/ ) { $pdbCode = $code; last; } } if ( $pdbCode eq "" ) { print STDERR "PDB code mistmatch with pdb files on this entry!\n"; print STDERR "pdbCode: $pdbCode\n"; print STDERR "entry: $entry\n"; } $pdbCode2PirAlignment{ $pdbCode } = "$entry*\n"; print "\nPDB code To PIR alignment found: $pdbCode =>\n$entry\n"; } close FI; $/ = "\n"; # restore the reading-block separator return %pdbCode2PirAlignment; } sub getTargetProteinAlignment { my ( $targetProteinCode, $targetProteinAlignment ) = @_; my $targetProteinAlignmentContent = ""; my @lines = split /\n/, $targetProteinAlignment; shift @lines; # remove header line shift @lines; # remove blank line my $sequenceAlignmentOnly = join "\n", @lines; $targetProteinAlignmentContent .= ">P1;$targetProteinCode\n"; $targetProteinAlignmentContent .= "sequence: : : : : : : : :\n"; $targetProteinAlignmentContent .= "$sequenceAlignmentOnly\n"; return $targetProteinAlignmentContent; } sub makeAtomFile { my ( $pdbFile, $atomFile ) = @_; !system("cp $pdbFile $atomFile") or die "File Copy Error: $!"; print "\tcopying $pdbFile to $atomFile ...\n"; } sub get_eachAlignmentContent { my ( $pdbCode, $pirAlignment, $pdbFile ) = @_; my $eachAlignmentContent = ""; my @lines = split /\n/, $pirAlignment; shift @lines; # remove header line shift @lines; # remove blank line pop @lines; # remove ending * my $sequenceAlignmentOnly = join "\n", @lines; $sequenceAlignmentOnly =~ s/[Xx]/-/g; my ( $structureInfoLine, $adjustedAlignmentSequence ) = get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile ); $eachAlignmentContent .= ">P1;$pdbCode\n"; $eachAlignmentContent .= $structureInfoLine; $eachAlignmentContent .= "$adjustedAlignmentSequence\n*\n"; return $eachAlignmentContent; } sub get_structureInfoAndAdjustedSequence { my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_; my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( "", "" ); my ( $startResNum, $startChain, $endResNum, $endChain ) = ("", "", "", ""); my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = ("", "", "", "", ""); my $allResiduesInPdb = ""; my $includedResNum = 0; my $isChainExist = 1; # true open FI, $pdbFile or die "File Open Error, $pdbFile: $!\n"; while ( my $line = <FI> ) { chomp $line; if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/) { $proteinName = $1; } elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/) { $isChainExist = 0; # false } elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/) { $source = $1; } elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/) { my $method = $1; $structureXorN = ( $method =~ /X-RAY/ ) ? "structureX" : ( $method =~ /NMR/ ) ? "structureN" : "structure"; } elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/) { $resolution = $1; } elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+\(WORKING SET\)\s*:\s*([\d\.\-]+)\s*$/) { $rFactor = $1; } elsif ($line =~ /^ATOM/) { my @values = split /\s+/, $line; my $threeLetterAminoAcidCode = $values[3]; my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode); if ( $startResNum eq "" && $isChainExist ) { ( $startChain, $startResNum ) = ( $values[4], $values[5] ); } elsif ( $startResNum eq "" ) { $startResNum = $values[4]; } if ( $isChainExist ) { ( $endChain, $endResNum ) = ( $values[4], $values[5] ); } else { $endResNum = $values[4]; } if ( $endResNum ne $includedResNum ) { $allResiduesInPdb .= $oneLetterAminoAcidCode; $includedResNum = $endResNum; } } elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read ) { last; } } close FI; $structureInfoLine = "$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain ". ":$proteinName :$source :$resolution :$rFactor \n"; $adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb ); return ( $structureInfoLine, $adjustedAlignmentSequence ); } sub convertThreeToOneLetterAminoAcidCode { my ( $threeLetterAminoAcidCode ) = @_; return ( $threeLetterAminoAcidCode =~ /gly/i ) ? "G" : ( $threeLetterAminoAcidCode =~ /pro/i ) ? "P" : ( $threeLetterAminoAcidCode =~ /ala/i ) ? "A" : ( $threeLetterAminoAcidCode =~ /val/i ) ? "V" : ( $threeLetterAminoAcidCode =~ /leu/i ) ? "L" : ( $threeLetterAminoAcidCode =~ /ile/i ) ? "I" : ( $threeLetterAminoAcidCode =~ /met/i ) ? "M" : ( $threeLetterAminoAcidCode =~ /cys/i ) ? "C" : ( $threeLetterAminoAcidCode =~ /phe/i ) ? "F" : ( $threeLetterAminoAcidCode =~ /tyr/i ) ? "Y" : ( $threeLetterAminoAcidCode =~ /trp/i ) ? "W" : ( $threeLetterAminoAcidCode =~ /his/i ) ? "H" : ( $threeLetterAminoAcidCode =~ /lys/i ) ? "K" : ( $threeLetterAminoAcidCode =~ /arg/i ) ? "R" : ( $threeLetterAminoAcidCode =~ /gln/i ) ? "Q" : ( $threeLetterAminoAcidCode =~ /asn/i ) ? "N" : ( $threeLetterAminoAcidCode =~ /glu/i ) ? "E" : ( $threeLetterAminoAcidCode =~ /asp/i ) ? "D" : ( $threeLetterAminoAcidCode =~ /ser/i ) ? "S" : ( $threeLetterAminoAcidCode =~ /thr/i ) ? "T" : "-"; } sub get_adjustedAlignmentSequence { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; my $adjustedAlignmentSequence = ""; my @lines = split /\n/, $sequenceAlignmentOnly; my $charactersPerLine = length $lines[0]; $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters # substitute front and end residues without atom coordinates to '-'s my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb ); my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb ); $adjustedAlignmentSequence = "-" x ($startPos - 1). substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based "-" x ( length($sequenceAlignmentOnly) - $endPos ); # substitute loop residues without atom coordinates to '-'s my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb ); foreach my $loopPosition ( @loopPositions ) { my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56 $adjustedAlignmentSequence = substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ). "-" x ( $loopEndPos - $loopStartPos + 1). substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos ); } my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence; $dashRemovedAdjustedAlignmentSequence =~ s/-//g; if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb ) { print STDERR "\nError!!!\n"; print STDERR "Number of residues in the PIR-alignment file and the PDB file mismatches !\n"; print STDERR "Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n"; print STDERR "dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n"; print STDERR "allResiduesInPdb: $allResiduesInPdb\n"; } return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine); } sub get_startPos { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; # Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment # matching with the residues in the PDB file my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 ); my $startPos = ""; my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly; my $pos = 0; foreach my $aa ( @allAminoAcidsInTheAlignmentSequence ) { $pos++; next if $aa eq "-"; my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 ); $frontRemovedSeq =~ s/-//g; # remove all dashes if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i ) { $startPos = $pos; print "\tstart position or non-loop start position determined: $pos\n"; last; } } if ( $startPos eq "" ) { print STDERR "\nError!\n"; print STDERR "Start position mismatches!\n"; print STDERR "Check the start amino acids of the alignment!\n\n"; print STDERR "sequenceAlignment: $sequenceAlignmentOnly\n"; print STDERR "residuesInPdb: $allResiduesInPdb\n\n"; } return $startPos; } sub get_endPos { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; # Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment # matching with the residues in the PDB file my $reversedLastTenAminoAcidsOfAllResiduesInPdb = reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 )); my $endPos = ""; my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly; my $pos = @allAminoAcidsInTheAlignmentSequence + 1; foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence ) { $pos--; next if $aa eq "-"; my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) ); $endRemovedReverseSeq =~ s/-//g; # remove all dashes if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i ) { $endPos = $pos; print "\tend position determined: $pos\n"; last; } } if ( $endPos eq "" ) { print STDERR "\nError!\n"; print STDERR "End position mismatches between the alignment file and the pdb file!\n"; print STDERR "Check the end amino acids of the alignment!\n\n"; print STDERR "sequenceAlignmentOnly: $sequenceAlignmentOnly\n"; print STDERR "allResiduesInPdb: $allResiduesInPdb\n\n"; } return $endPos; } sub reverseMyString { my ( $string ) = @_; my @allChars = split //, $string; return join("", reverse @allChars ); } sub get_loopPositions { my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_; my @loopPositions = (); my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence; $dashRemovedAdjustedSequence =~ s/-//g; if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb ) { return @loopPositions; } my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence; my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb; my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 ); while ( $alignmentResiduePos <= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number { $alignmentResiduePos++; next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq "-"; $pdbFileResiduePos++; if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] ) { my $frontRemovedAlignmentSeq = substr( $adjustedAlignmentSequence, $alignmentResiduePos - 1, length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 ); my $frontRemovedPdbResidueSeq = substr( $allResiduesInPdb, $pdbFileResiduePos - 1, length($allResiduesInPdb) - $pdbFileResiduePos + 1 ); my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1; my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1); if ( $loopLength ne "" && $loopStartPos <= $loopEndPos ) { push @loopPositions, "$loopStartPos-$loopEndPos"; print "\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n"; $alignmentResiduePos += $loopLength - 1; $pdbFileResiduePos--; } } } return @loopPositions; } sub formatFastaSequence { my ($seq, $charactersPerLine) = @_; $seq =~ s/(.{$charactersPerLine})/$1\n/g; return $seq; } sub makeAlignmentFile { my ( $alignmentFile, $alignmentFileContent ) = @_; open FO, ">$alignmentFile" or die "File Write Error, $alignmentFile: $!"; print FO $alignmentFileContent; close FO; } sub makeTopScriptFile { my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_; open FO, ">$topScriptFile" or die "File Write Error, $topScriptFile: $!"; print FO printTopScriptInvariableContentFront(); # print variable contents of the top script file print FO "SET ALNFILE = '$alignmentFile' # alignment filename\n"; print FO "SET KNOWNS = "; foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref ) { print FO "'$pdbCode' "; } print FO " # codes of the templates\n"; print FO "SET SEQUENCE = '$targetProteinCode' # code of the target\n"; print FO printTopScriptInvariableContentEnd(); close FO; } sub printTopScriptInvariableContentFront { return "". "# Homology modelling by the MODELLER TOP routine 'model'.\n". "\n". "INCLUDE # Include the predefined TOP routines\n". "\n". "SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file\n". "\n"; } sub printTopScriptInvariableContentEnd { return "". "SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n". "SET STARTING_MODEL= 1 # index of the first model\n". "SET ENDING_MODEL = 1 # index of the last model\n". " # (determines how many models to calculate)\n". "CALL ROUTINE = 'model' # do homology modelling\n". "\n"; }