sub printUsageAndExit { print STDERR "Error!\n\n"; print STDERR "Usage: perl -w $0 A1_baseDirectoryPath A2_pirAlignmentFile A3_targetProteinCode A4_pdbFiles\n\n"; print STDERR " A1_baseDirectoryPath: directory path containing all the PIR alignment file and pdb files\n\n"; print STDERR " A2_pirAlignmentFile: PIR-format output file (with the extension, '.pir') of multiple sequence alignment including a four-letter code in each header line (e.g. cdk2HumanHomologs)\n\n"; print STDERR " A3_targetProteinCode: the same four-letter code of your target protein as that in the alignment file (e.g. trgt or mine)\n\n"; print STDERR " A4_pdbFiles: pdb files (with the extension, '.pdb') separated by blank of which file name consists of the same four-letter code as that in the alignment file (e.g. 1erk.pdb 1p38.pdb)\n\n"; print STDERR "----\n"; exit; }
sub makeModellerInput { my ( $baseDirPath, $pirAlignmentFile, $targetProteinCode, $pdbFiles_ref ) = @_; print "\tchanging the directory to $baseDirPath ...\n"; chdir $baseDirPath or die "Dir Change Error, $baseDirPath: $!";
my ( $outputPrefix ) = $pirAlignmentFile =~ /^(\S+)\.\S+$/;
# modeller input files my $alignmentFile = $outputPrefix. ".ali"; my $topScriptFile = $outputPrefix. ".top"; my %pdbCode2AtomFile = get_pdbCode2File( $pdbFiles_ref, "atm" );
my %pdbCode2PdbFile = get_pdbCode2File( $pdbFiles_ref, "pdb" ); my %pdbCode2PirAlignment = get_pdbCode2PirAlignment( $pirAlignmentFile, $targetProteinCode, \%pdbCode2PdbFile );
# parse the target protein's alignment and information first my $alignmentFileContent = getTargetProteinAlignment( $targetProteinCode, $pdbCode2PirAlignment{$targetProteinCode} );
# parse alignments and information of known proteins and make atom files of them foreach my $pdbCode ( keys %pdbCode2PdbFile ) { my $atomFile = $pdbCode2AtomFile{ $pdbCode }; my $pdbFile = $pdbCode2PdbFile{ $pdbCode }; my $pirAlignment = $pdbCode2PirAlignment{ $pdbCode };
# make the atom files print "\tmaking the atom file, $atomFile ...\n"; makeAtomFile( $pdbFile, $atomFile );
# make the alignment file print "\tmaking the alignment file, $alignmentFile ...\n"; makeAlignmentFile( $alignmentFile, $alignmentFileContent );
# make the top script file print "\tmaking the top script file, $topScriptFile ...\n"; makeTopScriptFile( $topScriptFile, $alignmentFile, $targetProteinCode, \%pdbCode2AtomFile );
print "Finished!\n\n"; print "----\n"; }
sub get_pdbCode2File { my ( $pdbFiles_ref, $fileExtension ) = @_; my %pdbCode2File = ();
foreach my $pdbFile ( @$pdbFiles_ref ) { my ( $pdbCode ) = $pdbFile =~ /(\w{4})\.pdb$/; # extract only the four-letters my $file = $pdbCode. ".". $fileExtension; $pdbCode2File{ $pdbCode } = $file; }
return %pdbCode2File; }
sub get_pdbCode2PirAlignment { my ( $pirAlignmentFile, $targetProteinCode, $pdbCode2PdbFile_ref ) = @_; my %pdbCode2PirAlignment = ();
if ( $endResNum ne $includedResNum ) { $allResiduesInPdb .= $oneLetterAminoAcidCode; $includedResNum = $endResNum; } } elsif ($line =~ /^TER/) # read only the atom list ( if multi-chain protein, only the first chain is read ) { last; } } close FI;
sub get_adjustedAlignmentSequence { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_; my $adjustedAlignmentSequence = "";
my @lines = split /\n/, $sequenceAlignmentOnly; my $charactersPerLine = length $lines[0]; $sequenceAlignmentOnly =~ s/\n//g; # remove new line characters
# substitute front and end residues without atom coordinates to '-'s my $startPos = get_startPos( $sequenceAlignmentOnly, $allResiduesInPdb ); my $endPos = get_endPos( $sequenceAlignmentOnly, $allResiduesInPdb ); $adjustedAlignmentSequence = "-" x ($startPos - 1). substr( $sequenceAlignmentOnly, $startPos - 1, $endPos - $startPos + 1 ). # substr is zero-based "-" x ( length($sequenceAlignmentOnly) - $endPos );
# substitute loop residues without atom coordinates to '-'s my @loopPositions = get_loopPositions( $adjustedAlignmentSequence, $allResiduesInPdb ); foreach my $loopPosition ( @loopPositions ) { my ( $loopStartPos, $loopEndPos ) = split /-/, $loopPosition; # e.g. loop residues 45-56 $adjustedAlignmentSequence = substr( $adjustedAlignmentSequence, 0, $loopStartPos - 1 ). "-" x ( $loopEndPos - $loopStartPos + 1). substr( $adjustedAlignmentSequence, $loopEndPos, length($adjustedAlignmentSequence) - $loopEndPos ); }
my $dashRemovedAdjustedAlignmentSequence = $adjustedAlignmentSequence; $dashRemovedAdjustedAlignmentSequence =~ s/-//g;
if ( $dashRemovedAdjustedAlignmentSequence ne $allResiduesInPdb ) { print STDERR "\nError!!!\n"; print STDERR "Number of residues in the PIR-alignment file and the PDB file mismatches !\n"; print STDERR "Substitue residues in the PIR-alignment file without atom coordinates to dashes('-'s) manually!\n\n"; print STDERR "dashRemovedAdjustedAlignmentSequence: $dashRemovedAdjustedAlignmentSequence\n"; print STDERR "allResiduesInPdb: $allResiduesInPdb\n"; }
sub get_startPos { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
# Hypothesis: if ten residues match in the front end, we can determine the start position in the alignment # matching with the residues in the PDB file my $firstTenAminoAcidsOfAllResiduesInPdb = substr( $allResiduesInPdb, 0, 10 );
my $startPos = ""; my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly; my $pos = 0;
foreach my $aa ( @allAminoAcidsInTheAlignmentSequence ) { $pos++; next if $aa eq "-";
if ( $frontRemovedSeq =~ /^$firstTenAminoAcidsOfAllResiduesInPdb/i ) { $startPos = $pos; print "\tstart position or non-loop start position determined: $pos\n"; last; } }
if ( $startPos eq "" ) { print STDERR "\nError!\n"; print STDERR "Start position mismatches!\n"; print STDERR "Check the start amino acids of the alignment!\n\n"; print STDERR "sequenceAlignment: $sequenceAlignmentOnly\n"; print STDERR "residuesInPdb: $allResiduesInPdb\n\n"; }
return $startPos; }
sub get_endPos { my ( $sequenceAlignmentOnly, $allResiduesInPdb ) = @_;
# Hypothesis: if ten residues match in the rear end, we can determine the end position in the alignment # matching with the residues in the PDB file my $reversedLastTenAminoAcidsOfAllResiduesInPdb = reverseMyString(substr( $allResiduesInPdb, length($allResiduesInPdb) - 10, 10 ));
my $endPos = ""; my @allAminoAcidsInTheAlignmentSequence = split //, $sequenceAlignmentOnly; my $pos = @allAminoAcidsInTheAlignmentSequence + 1;
foreach my $aa ( reverse @allAminoAcidsInTheAlignmentSequence ) { $pos--; next if $aa eq "-";
my $endRemovedReverseSeq = reverseMyString( substr( $sequenceAlignmentOnly, 0, $pos ) ); $endRemovedReverseSeq =~ s/-//g; # remove all dashes
if ( $endRemovedReverseSeq =~ /^$reversedLastTenAminoAcidsOfAllResiduesInPdb/i ) { $endPos = $pos; print "\tend position determined: $pos\n"; last; } }
if ( $endPos eq "" ) { print STDERR "\nError!\n"; print STDERR "End position mismatches between the alignment file and the pdb file!\n"; print STDERR "Check the end amino acids of the alignment!\n\n"; print STDERR "sequenceAlignmentOnly: $sequenceAlignmentOnly\n"; print STDERR "allResiduesInPdb: $allResiduesInPdb\n\n"; }
return $endPos; }
sub reverseMyString { my ( $string ) = @_; my @allChars = split //, $string;
return join("", reverse @allChars ); }
sub get_loopPositions { my ( $adjustedAlignmentSequence, $allResiduesInPdb ) = @_; my @loopPositions = ();
my $dashRemovedAdjustedSequence = $adjustedAlignmentSequence; $dashRemovedAdjustedSequence =~ s/-//g;
if ( $dashRemovedAdjustedSequence eq $allResiduesInPdb ) { return @loopPositions; }
my @allAminoAcidsInTheAlignmentSequence = split //, $adjustedAlignmentSequence; my @allAminoAcidsInThePdbFile = split //, $allResiduesInPdb; my ( $alignmentResiduePos, $pdbFileResiduePos ) = ( 0, 0 );
while ( $alignmentResiduePos <= $#allAminoAcidsInTheAlignmentSequence ) # '$#' means the zero-based last entry number { $alignmentResiduePos++; next if $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] eq "-"; $pdbFileResiduePos++;
if ( $allAminoAcidsInTheAlignmentSequence[$alignmentResiduePos-1] ne $allAminoAcidsInThePdbFile[$pdbFileResiduePos - 1] ) { my $frontRemovedAlignmentSeq = substr( $adjustedAlignmentSequence, $alignmentResiduePos - 1, length($adjustedAlignmentSequence) - $alignmentResiduePos + 1 ); my $frontRemovedPdbResidueSeq = substr( $allResiduesInPdb, $pdbFileResiduePos - 1, length($allResiduesInPdb) - $pdbFileResiduePos + 1 );
my $loopLength = get_startPos($frontRemovedAlignmentSeq, $frontRemovedPdbResidueSeq) - 1; my ( $loopStartPos, $loopEndPos ) = ( $alignmentResiduePos, $alignmentResiduePos + $loopLength - 1);
if ( $loopLength ne "" && $loopStartPos <= $loopEndPos ) { push @loopPositions, "$loopStartPos-$loopEndPos"; print "\tloop pisition found: $loopStartPos-$loopEndPos, length=$loopLength\n";
sub formatFastaSequence { my ($seq, $charactersPerLine) = @_; $seq =~ s/(.{$charactersPerLine})/$1\n/g; return $seq; }
sub makeAlignmentFile { my ( $alignmentFile, $alignmentFileContent ) = @_;
open FO, ">$alignmentFile" or die "File Write Error, $alignmentFile: $!"; print FO $alignmentFileContent; close FO; }
sub makeTopScriptFile { my ( $topScriptFile, $alignmentFile, $targetProteinCode, $pdbCode2AtomFile_ref ) = @_;
open FO, ">$topScriptFile" or die "File Write Error, $topScriptFile: $!"; print FO printTopScriptInvariableContentFront();
# print variable contents of the top script file print FO "SET ALNFILE = '$alignmentFile' # alignment filename\n"; print FO "SET KNOWNS = "; foreach my $pdbCode ( sort keys %$pdbCode2AtomFile_ref ) { print FO "'$pdbCode' "; } print FO " # codes of the templates\n"; print FO "SET SEQUENCE = '$targetProteinCode' # code of the target\n";
print FO printTopScriptInvariableContentEnd(); close FO; }
sub printTopScriptInvariableContentFront { return "". "# Homology modelling by the MODELLER TOP routine 'model'.\n". "\n". "INCLUDE # Include the predefined TOP routines\n". "\n". "SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file\n". "\n"; }
sub printTopScriptInvariableContentEnd { return "". "SET ATOM_FILES_DIRECTORY = './' # directories for input atom files\n". "SET STARTING_MODEL= 1 # index of the first model\n". "SET ENDING_MODEL = 1 # index of the last model\n". " # (determines how many models to calculate)\n". "CALL ROUTINE = 'model' # do homology modelling\n". "\n"; }