Changes

Perl script for ATM

4,305 bytes added, 17:51, 12 May 2006

no edit summary

<<pre>#!/usr/bin/perl -w use strict; doMain(); sub doMain { my $baseDirPath = shift @ARGV || """" ; my $pirAlignmentFile = shift @ARGV || """"; my $targetProteinCode = shift @ARGV || """ my "; my $pdbFiles_ref = \@ARGV || (); print ""----\n""; print ""\nChecking command-line parameters ...\n"" ; printUsageAndExit() unless ( $baseDirPath && amp; -d $baseDirPath ); $baseDirPath =~ s/\/$//; # remove the ending slash if exists printUsageAndExit() unless ( $pirAlignmentFile && amp; -f ""$baseDirPath/$pirAlignmentFile" " && amp; $pirAlignmentFile =~ /\.pir$/ ); printUsageAndExit() unless ( $targetProteinCode && amp; length($targetProteinCode) == 4 ); printUsageAndExit() unless ( @$pdbFiles_ref > > 0 ); foreach my $pdbFile ( @$pdbFiles_ref ) { printUsageAndExit() unless ( -f ""$baseDirPath/$pdbFile" " & amp;& $pdbFile =~ /\.pdb$/ ); } ~~print "Making modeller~~ print "Making modeller input files ...\n"" ; makeModellerInput( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ); } sub printUsageAndExit { print STDERR ""Error!\n\n"" ; print STDERR ""Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n"" ; print STDERR " " A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n""; print STDERR " " A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n""; print STDERR " " A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n"" ; print STDERR " " A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n""; print STDERR ""----\n"" ; exit; } ~~sub makeModellerInput~~ sub makeModellerInput { my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_; print ""\tchanging the directory to $baseDirPath ...\n""; chdir $baseDirPath or die ""Dir Change Error, $baseDirPath: $!"" ; my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/; # modeller input files my $alignmentFile = $outputPrefix. "".ali""; my $topScriptFile = $outputPrefix. "".top""; my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, ""atm" " ); my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, ""pdb" " ); my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile ); # parse the target protein's alignment and information first my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} ); # parse alignments and information of known proteins and make atom files of them foreach my $pdbCode ( keys %pdbCode2PdbFile ) { my $atomFile = $pdbCode2AtomFile{ $pdbCode }; my $pdbFile = $pdbCode2PdbFile{ $pdbCode }; my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode }; # make the atom files print ""\tmaking the atom file, $atomFile ...\n"" ; makeAtomFile( $pdbFile, $atomFile ); $alignmentFileContent .= get_eachAlignmentContent( $pdbCode, $pirAlignment, $pdbFile ); } } # make the alignment file print ""\tmaking the alignment file, $alignmentFile ...\n""; makeAlignmentFile( $alignmentFile, $alignmentFileContent ); # make the top script file print ""\tmaking the top script file, $topScriptFile ...\n"" ; makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile ); ~~print "Finished~~ print "Finished!\n\n""; print ""----\n""; } sub get_pdbCode2File { my ( $pdbFiles_ref, $fileExtension ) = @_; my %pdbCode2File = (); foreach my $pdbFile ( @$pdbFiles_ref ) { { my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters my $file = $pdbCode. ""."". $fileExtension; $pdbCode2File{ $pdbCode } = $file; } return %pdbCode2File; } sub get_pdbCode2PirAlignment { my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_; my my %pdbCode2PirAlignment = (); my @pdbCodes = (); push @pdbCodes, $targetProteinCode; push @pdbCodes, (sort keys %$pdbCode2PdbFile_ref ); $/ = ""*\n""; # change the reading-block separator open FI, $pirAlignmentFile or die ""File Open Error, $pirAlignmentFile: $!\n"" ; while ( my $entry = <FIfi> ) { { chomp $entry; next if $entry =~ /^\s*$/; my @lines = split /\n/, $entry; my $pdbCode = """ "; foreach my $code ( @pdbCodes ) { if ( $lines[0] =~ /$code/ ) { $pdbCode = $code; last; } } if ( $pdbCode eq "" "" ) { print STDERR ""PDB code mistmatch with pdb files on this entry!\n""; print STDERR ""pdbCode: $pdbCode\n"" ; print STDERR ""entry: $entry\n"" ; } $pdbCode2PirAlignment{ $pdbCode } = ""$entry*\n""; print ""\nPDB code To PIR alignment found: $pdbCode =>>\n$entry\n""; } close FI; $/ = ""\n""; # restore the reading-block separator return %pdbCode2PirAlignment;} } sub getTargetProteinAlignment { my ( $targetProteinCode, $targetProteinAlignment ) = @_; my $targetProteinAlignmentContent = """" ; my @lines = split /\n/, $targetProteinAlignment; shift @lines; # remove header line shift @lines; # remove blank line my $sequenceAlignmentOnly = join ""\n"", @lines; $targetProteinAlignmentContent .= ">">P1;$targetProteinCode\n"" ; $targetProteinAlignmentContent .= ""sequence: : : : : : : : :\n""; $targetProteinAlignmentContent .= ""$sequenceAlignmentOnly\n""; return $targetProteinAlignmentContent; } sub makeAtomFile { my ( $pdbFile, $atomFile ) = @_; !system(""cp $pdbFile $atomFile"") or die ""File Copy Error: $!""; print ""\tcopying $pdbFile to $atomFile ...\n""; } sub get_eachAlignmentContent { my ( $pdbCode, $pirAlignment, $pdbFile ) = @_; my $eachAlignmentContent = """ "; my @lines = split /\n/, $pirAlignment; shift @lines; # remove header line shift @lines; # remove blank line pop @lines; # remove ending * my $sequenceAlignmentOnly = join ""\n"", @lines; $sequenceAlignmentOnly =~ s/[Xx]/-/g; my ( $structureInfoLine, $adjustedAlignmentSequence ) = get_structureInfoAndAdjustedSequence( $pdbCode, $sequenceAlignmentOnly, $pdbFile ); $ $eachAlignmentContent .= ">">P1;$pdbCode\n""; $eachAlignmentContent .= $structureInfoLine; $eachAlignmentContent .= ""$adjustedAlignmentSequence\n*\n""; return $eachAlignmentContent; } sub get_structureInfoAndAdjustedSequence { my ( $pdbCode, $sequenceAlignmentOnly, $pdbFile ) = @_; my my ( $structureInfoLine, $adjustedAlignmentSequence ) = ( """", "" "" ); my ( $startResNum, $startChain, $endResNum, $endChain ) = ("""", """", """", """"); my ( $structureXorN, $proteinName, $source, $resolution, $rFactor ) = ("""", """", """", """", """"); my $allResiduesInPdb = """" ; my $includedResNum = 0; my $isChainExist = 1; # true open FI, $pdbFile or die ""File Open Error, $pdbFile: $!\n"" ; while ( my $line = <FIfi> ) { chomp $line; if ($line =~ /^COMPND.*MOLECULE:\s*(\S.*\S);\s*$/) { $proteinName = $1; } elsif ($line =~ /^COMPND.*CHAIN:\s+NULL;\s*$/) { $isChainExist = 0; # false } elsif ($line =~ /ORGANISM_SCIENTIFIC:\s*(\S.*\S);\s*$/) { $source = $1; } elsif ($line =~ /^EXPDTA\s+(\S.*\S)\s*$/) { my $method = $1; $structureXorN = ( $method =~ /X-RAY/ ) ? ""structureX" " : ( $method =~ /NMR/ ) ? ""structureN" " : " "structure"" ; } elsif ($line =~ /^REMARK.*RESOLUTION.\s+([\d\.]+)\s+ANGSTROM.*$/) { $resolution = $1; } elsif ($line =~ /^REMARK\s+\d+\s+R VALUE\s+$WORKING SET$\s*:\s*([\d\.\-]+)\s*$/) { $rFactor = $1; } elsif ($line =~ /^ATOM/) { my @values = split /\s+/, $line; my $threeLetterAminoAcidCode = $values[3]; my $oneLetterAminoAcidCode = convertThreeToOneLetterAminoAcidCode($threeLetterAminoAcidCode); if ( $startResNum eq "" "& $quot; && $isChainExist ) { ( $startChain, $startResNum ) = ( $values[4], $values[5] ); } elsif ( $startResNum eq "" "" ) { $startResNum = $values[4]; } if ( $isChainExist ) { ( $endChain, $endResNum ) = ( $values[4], $values[5] ); } else { $endResNum = $values[4]; } if ( $endResNum ne $includedResNum ) { $allResiduesInPdb .= $oneLetterAminoAcidCode; $includedResNum = $endResNum; } } ~~elsif~~ elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read ) { last; } } close FI; $structureInfoLine = ""$structureXorN:$pdbCode:$startResNum :$startChain :$endResNum :$endChain "". " ":$proteinName :$source :$resolution :$rFactor \n""; $adjustedAlignmentSequence = get_adjustedAlignmentSequence( $sequenceAlignmentOnly, $allResiduesInPdb ); ~~return (~~ return ( $structureInfoLine, $adjustedAlignmentSequence ); } sub convertThreeToOneLetterAminoAcidCode { my ( $threeLetterAminoAcidCode ) = @_; return ( $threeLetterAminoAcidCode =~ /gly/i ) ? ""G" " : ( $threeLetterAminoAcidCode =~ /pro/i ) ? ""P" " : ( $threeLetterAminoAcidCode =~ /ala/i ) ? ""A" " : ( $threeLetterAminoAcidCode =~ /val/i ) ? ""V" " : ( $threeLetterAminoAcidCode =~ /leu/i ) ? ""L" " : ( $threeLetterAminoAcidCode =~ /ile/i ) ? ""I" " : ( $threeLetterAminoAcidCode =~ /met/i ) ? ""M" " : ( $threeLetterAminoAcidCode =~ /cys/i ) ? ""C" " : ( $threeLetterAminoAcidCode =~ /phe/i ) ? ""F" " : ( $threeLetterAminoAcidCode =~ /tyr/i ) ? ""Y" " : ( $threeLetterAminoAcidCode =~ /trp/i ) ? ""W" " : ( $threeLetterAminoAcidCode =~ /his/i ) ? ""H" " : ( $threeLetterAminoAcidCode =~ /lys/i ) ? ""K" " : ( $threeLetterAminoAcidCode =~ /arg/i ) ? ""R" " : ( $threeLetterAminoAcidCode =~ /gln/i ) ? ""Q" " : ( $threeLetterAminoAcidCode =~ /asn/i ) ? ""N" " : ( ( $threeLetterAminoAcidCode =~ /glu/i ) ? ""E" " : ( $threeLetterAminoAcidCode =~ /asp/i ) ? ""D" " : ( $threeLetterAminoAcidCode =~ /ser/i ) ? ""S" " : ( $threeLetterAminoAcidCode =~ /thr/i ) ? ""T" " : " "-""; } sub get_adjustedAlignmentSequence { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; my $adjustedAlignmentSequence = """"; my @lines = split /\n/, $sequenceAlignmentOnly; my $charactersPerLine = length $lines[0]; $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters # substitute front and end residues without atom coordinates to '-'s my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb ); my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb ); $adjustedAlignmentSequence = ""-" " x ($startPos - 1). substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based " "-" " x ( length($sequenceAlignmentOnly) - $endPos ); # substitute loop residues without atom coordinates to '-'s my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb ); foreach my $loopPosition ( @loopPositions ) { my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56 $adjustedAlignmentSequence = substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ). ~~"-"~~ "-" x ( $loopEndPos - $loopStartPos + 1). substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos ); } my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence; $dashRemovedAdjustedAlignmentSequence =~ s/-//g; if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb ) { print STDERR ""\nError!!!\n"" ; print STDERR ""Number of residues in the PIR-alignment file and the PDB file mismatches !\n"" ; print STDERR ""Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n""; print STDERR ""dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n""; print STDERR ""allResiduesInPdb: $allResiduesInPdb\n"" ; } return formatFastaSequence($adjustedAlignmentSequence, $charactersPerLine); } sub get_startPos { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; # Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment # matching with the residues in the PDB file my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 ); my $startPos = """" ; my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly; my $pos = 0; foreach my $aa ( @allAminoAcidsInTheAlignmentSequence ) { { $pos++; next if $aa eq ""-""; my $frontRemovedSeq = substr( $sequenceAlignmentOnly, $pos - 1, length($sequenceAlignmentOnly) - $pos + 1 ); $frontRemovedSeq =~ s/-//g; # remove all dashes if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i ) { $startPos = $pos; print ""\tstart position or non-loop start position determined: $pos\n"" ; last; } } if ( $startPos eq "" "" ) { print STDERR ""\nError!\n"" ; print STDERR ""Start position mismatches!\n"" ; print STDERR ""Check the start amino acids of the alignment!\n\n"" ; print STDERR ""sequenceAlignment: $sequenceAlignmentOnly\n""; print STDERR ""residuesInPdb: $allResiduesInPdb\n\n""; } return $startPos; } sub get_endPos { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; # Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment # # matching with the residues in the PDB file my $reversedLastTenAminoAcidsOfAllResiduesInPdb = reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 )); my $endPos = """"; my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly; my $pos = @allAminoAcidsInTheAlignmentSequence + 1; foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence ) { $pos--; next if $aa eq ""-""; my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) ); $endRemovedReverseSeq =~ s/-//g; # remove all dashes if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i ) { $endPos = $pos; print ""\tend position determined: $pos\n""; last; } } if if ( $endPos eq "" "" ) { print STDERR ""\nError!\n""; print STDERR ""End position mismatches between the alignment file and the pdb file!\n""; print STDERR ""Check the end amino acids of the alignment!\n\n"" ; print STDERR ""sequenceAlignmentOnly: $sequenceAlignmentOnly\n"" ; print STDERR ""allResiduesInPdb: $allResiduesInPdb\n\n"" ; } return $endPos; } sub reverseMyString { my ( $string ) = @_; my @allChars = split //, $string; return join("""", reverse @allChars ); } sub get_loopPositions { my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_; my @loopPositions = (); my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence; $dashRemovedAdjustedSequence =~ s/-//g; if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb ) { return @loopPositions; } my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence; my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb; my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 ); while ( $alignmentResiduePos <<= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number { $alignmentResiduePos++; next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq ""-"" ; $pdbFileResiduePos++; if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] ) { my $frontRemovedAlignmentSeq = substr( $adjustedAlignmentSequence, $alignmentResiduePos - 1, length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 ); my $frontRemovedPdbResidueSeq = substr( $allResiduesInPdb, $pdbFileResiduePos - 1, length($allResiduesInPdb) - $pdbFileResiduePos + 1 ); my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1; my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1); if if ( $loopLength ne "" "" && amp; $loopStartPos <<= $loopEndPos ) { push @loopPositions, ""$loopStartPos-$loopEndPos"" ; print ""\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n"" ; $alignmentResiduePos += $loopLength - 1; $pdbFileResiduePos--; } } } } return @loopPositions; } sub formatFastaSequence { my ($seq, $charactersPerLine) = @_; $seq =~ s/(.{$charactersPerLine})/$1\n/g; return $seq; } sub makeAlignmentFile { my ( $alignmentFile, $alignmentFileContent ) = @_; open FO, ">">$alignmentFile" " or die ""File Write Error, $alignmentFile: $!"" ; print FO $alignmentFileContent; close FO; } sub makeTopScriptFile { my my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_; open FO, ">">$topScriptFile" " or die ""File Write Error, $topScriptFile: $!""; print FO printTopScriptInvariableContentFront(); # print variable contents of the top script file print FO ""SET ALNFILE = '$alignmentFile' # alignment filename\n"" ; print FO ""SET KNOWNS = "" ; foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref ) { print FO ""'$pdbCode' "" ; } print FO " " # codes of the templates\n"" ; print FO ""SET SEQUENCE = '$targetProteinCode' # code of the target\n"" ; print FO printTopScriptInvariableContentEnd(); ~~close~~ close FO; } sub printTopScriptInvariableContentFront { return """". " "# Homology modelling by the MODELLER TOP routine 'model'.\n"". " "\n"". " "INCLUDE # Include the predefined TOP routines\n"". "\ "\n"". " "SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file\n"". " "\n""; } sub printTopScriptInvariableContentEnd { return """". " "SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n"". " "SET STARTING_MODEL= 1 # index of the first model\n"". " "SET ENDING_MODEL = 1 # index of the last model\n"". " " # (determines how many models to calculate)\n"". " "CALL ROUTINE = 'model' # do homology modelling\n"". " "\n""; } </fi></fi></pre>

Anonymous user

210.218.222.82

Changes

Perl script for ATM

Navigation menu

Views

Personal tools

Related Links

Advertisements

Search

Tools

Related Links[Edit]