BioPHP: PHP for Biocomputing

Last updated: May 17, 2003 (Saturday)

Welcome Message top

Source Code Listing for seqalign.inc

<?php
require_once("seq.inc");

/* 
The SeqAlign class represents the result of an alignment performed by various third-party 
software such as ClustalW.  The alignment is usually found in a file that uses a particular 
format. Right now, my code supports only FASTA and CLUSTAL formats.
   
SeqAlign properties and methods allow users to perform post-alignment operations, etc. 
*/
class SeqAlign
{ // OPENS class SeqAlign
var $length;
var $seq_count;
var $gap_count;
var $seqset;
var $seqptr;
var $is_flush;

// sort_alpha rearranges the sequences in an alignment set alphabetically by their sequence id.
// In addition, you can specify if you wish it be in ascending or descending order via $option.

function sort_alpha($option = "ASC")
   {
   $temp = array();
   foreach($this->seqset as $seqitem)
      {
      $key = $seqitem->id . str_pad($seqitem->start, 9, "0", STR_PAD_LEFT);
      $temp[$key] = $seqitem;
      }

   $option = strtoupper($option);
   if ($option == "ASC") asort($temp);
   elseif ($option == "DESC") arsort($temp);
   else die("Invalid argument #1 passed to SORT_ALPHA() method!");

   $temp2 = array();
   foreach($temp as $key => $value)
      $temp2[] = $value;
   $this->seqset = $temp2;
   }

// first() moves the sequence pointer to the first sequence in the alignment set.
function first()
   {
   $this->seqptr = 0;
   }

// last() moves the sequence pointer to the last sequence in the alignment set.
function last()
   {
   $this->seqptr = $this->seq_count - 1;
   }

// prev() moves the sequence pointer to the sequence before the current one.
function prev()
   {
   if ($this->seqptr > 0) $this->seqptr--;
   }

// next() moves the sequence pointer to the sequence after the current one.
function next()
   {
   if ($this->seqptr < $this->seq_count-1) $this->seqptr++;
   }

// fetch() retrieves a particular sequence (identified by its index number) from an alignment set.
function fetch($index = "")
   {
   if (strlen($index) == 0) $index = $this->seqptr;
   return $this->seqset[$index];
   }

// get_length() returns the lenght of the longest sequence in an alignment set.
function get_length()
   {
   $maxlen = 0;
   foreach($this->seqset as $seqitem)
      if ($seqitem->length > $maxlen) $maxlen = $seqitem->length;
   return $maxlen;
   }

// get_gap_count() counts the number of gaps ("-") found in all sequences in an alignment set. 
function get_gap_count()
   {
   $gapctr = 0;
   foreach($this->seqset as $seqitem)
      $gapctr += $seqitem->symfreq("-");
   return $gapctr;
   }

// get_is_flush() tests if all the sequences in an alignment set have the same length.
function get_is_flush()
   {
   $samelength = TRUE;
   $ctr = 0;
   foreach($this->seqset as $element)
      {
      $ctr++;
      $currlen = $element->seqlen();
      if ($ctr == 1)
         {
         $prevlen = $currlen;
         continue;
         }
      if ($currlen != $prevlen)
         {
         $samelength = FALSE;
         break;
         }
      $prevlen = $currlen;
      }
   return $samelength;
   }

// char_at_res() returns the character found at a given residue number in a given sequence.
function char_at_res($seqidx, $res)
   {
   $seqobj = $this->seqset[$seqidx];
   if ($res > $seqobj->end) return FALSE;
   if ($res < $seqobj->start) return FALSE;

   $len = $seqobj->seqlen();
   $nongap_count = $res - $seqobj->start + 1;
   $nongap_ctr = 0;
   for($col = 0; $col < $len; $col++)
      {
      $currlet = substr($seqobj->sequence, $col, 1);
      if ($currlet == "-") continue;
      else
         {
         $nongap_ctr++;
         if ($nongap_ctr == $nongap_count) return $currlet;
         }
      }
   }

// substr_bw_res() gets the substring between two residues in a sequence that is part of an alignment set.
function substr_bw_res($seqidx, $res_beg, $res_end = "")
   {
   $seqobj = $this->seqset[$seqidx];
   // Later, you can return a code which identifies the type of error.
   if ($res_end > $seqobj->end) return FALSE;
   if ($res_beg < $seqobj->start) return FALSE;
   if ( (gettype($res_end) == "string") and (strlen($res_end) == 0) )
      $res_end = $seqobj->end;

   $res_begctr = $res_beg - $seqobj->start + 1;
   $res_endctr = $res_end - $seqobj->start + 1;

   $len = $seqobj->seqlen();
   $nongap_ctr = 0;
   $subseq = "";
   for($col = 0; $col < $len; $col++)
      {
      $currlet = substr($seqobj->sequence, $col, 1);
      if ($currlet != "-") $nongap_ctr++;
      if ( ($nongap_ctr >= $res_begctr) and
         ($nongap_ctr <= $res_endctr) )
         $subseq .= $currlet;
      elseif ($nongap_ctr > $res_endctr) break;
      }
   return $subseq;
   }

// col2res() converts a column number to a residue number in a sequence that is part of an alignment set.
function col2res($seqidx, $col)
   {
   $seqobj = $this->seqset[$seqidx];
   // Later, you can return a code which identifies the type of error.
   if ($col > $seqobj->seqlen() - 1) return FALSE;
   if ($col < 0) return FALSE;

   $nongap_ctr = 0;
   for($i = 0; $i <= $col; $i++)
      {
      $currlet = substr($seqobj->sequence, $i, 1);

      if ($currlet != "-") $nongap_ctr++;
      }
   if ($currlet == "-") return "-";
   else return ($seqobj->start + $nongap_ctr - 1);
   }

// res2col() converts a residue number to a column number in a sequence in an alignment set.
function res2col($seqidx, $res)
   {
   $seqobj = $this->seqset[$seqidx];
   // Later, you can return a code which identifies the type of error.
   if ($res > $seqobj->end) return FALSE;
   if ($res < $seqobj->start) return FALSE;

   $len = $seqobj->seqlen();
   $nongap_count = $res - $seqobj->start + 1;
   $nongap_ctr = 0;
   for($col = 0; $col < $len; $col++)
      {
      $currlet = substr($seqobj->sequence, $col, 1);
      if ($currlet == "-") continue;
      else
         {
         $nongap_ctr++;
         if ($nongap_ctr == $nongap_count) return $col;
         }
      }
   }

// subalign() returns a subset of consecutive sequences in an alignment set.
function subalign($beg, $end)
   {
   if (($beg < 0) or ($end < 0)) die("Invalid argument passed to SUBALIGN() method!");
   if (($beg > $this->seq_count-1) or ($end > $this->seq_count-1))
      die("Invalid argument passed to SUBALIGN() method!");

   $new_seqset = array();
   $new_align = new SeqAlign();
   $new_align->seqset = array_slice($this->seqset, $beg, $end-$beg+1);
   $new_align->length = $new_align->get_length();
   $new_align->seq_count = $end - $beg + 1;
   $new_align->gap_count = $new_align->get_gap_count();
   $new_align->seqptr = 0;
   $new_align->is_flush = $new_align->get_is_flush();
   return $new_align;
   }

// select() returns a set of (possibly non-consecutive) sequences in an alignment set.
function select()
   {
   $arglist = func_get_args();
   if (count($arglist) == 0) die("Must pass at least one argument to SELECT() method!");

   $new_seqset = array();
   $new_align = new SeqAlign();
   $ctr = 0;
   foreach($arglist as $seqindex)
      {
      $new_seqset[] = $this->seqset[$seqindex];
      $ctr++;
      }
   $new_align->seqset = $new_seqset;
   $new_align->length = $new_align->get_length();
   $new_align->seq_count = count($arglist);
   $new_align->gap_count = $new_align->get_gap_count();
   $new_align->seqptr = 0;
   $new_align->is_flush = $new_align->get_is_flush();
   return $new_align;
   }

// res_var() identifies the positions of variant and invariant (conserved) residues in an alignment set.
function res_var($threshold = 100){
   // for now, assume all the sequences are equal in length.
   $alphabet = array('A','B','C','D','E','F','G','H','I','J','K','L','M',
                     'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');

   $all_pos = array();
   $invar_pos = array();
   $var_pos = array();
   $firstseq = $this->seqset[0];
   $seqlength = strlen($firstseq->sequence);

   $globfreq = array();
   for($i = 0; $i < count($alphabet); $i++)
      {
      $currlet = $alphabet[$i];
      $globfreq[$currlet] = 0;
      }

   for($i = 0; $i < $seqlength; $i++)
      {
      $freq = $globfreq;
      for($j = 0; $j < $this->seq_count; $j++)
         {
         $currseq = $this->seqset[$j];
         $currlet = substr($currseq->sequence, $i, 1);
         $freq[$currlet]++;
         }
      arsort($freq);
      list($key, $value) = each($freq);
      $maxpercent = ($value/$this->seq_count) * 100;
      if ($maxpercent >= $threshold) array_push($invar_pos, $i);
      else array_push($var_pos, $i);
      }
   $all_pos["INVARIANT"] = $invar_pos;
   $all_pos["VARIANT"] = $var_pos;
   return $all_pos;
   }

// consensus() returns the consensus string for an alignment set.  See technical reference for details.
function consensus($threshold = 100)
   {
   // for now, assume all the sequences are equal in length.
   $alphabet = array('A','B','C','D','E','F','G','H','I','J','K','L','M',
                     'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');

   $resultstr = "";
   $firstseq = $this->seqset[0];
   $seqlength = strlen($firstseq->sequence);

   $globfreq = array();
   for($i = 0; $i < count($alphabet); $i++)
      {
      $currlet = $alphabet[$i];
      $globfreq[$currlet] = 0;
      }

   for($i = 0; $i < $seqlength; $i++)
      {
      $freq = $globfreq;
      for($j = 0; $j < $this->seq_count; $j++)
         {
         $currseq = $this->seqset[$j];
         $currlet = substr($currseq->sequence, $i, 1);
         $freq[$currlet]++;
         }
      arsort($freq);
      list($key, $value) = each($freq);
      $maxpercent = ($value/$this->seq_count) * 100;
      if ($maxpercent >= $threshold) $resultstr = $resultstr . $key;
      else $resultstr = $resultstr . "?";
      }
   return $resultstr;
   }

// add_seq() adds a sequence to an alignment set.
function add_seq($seqobj)
   {
   if (gettype($seqobj) == "object")
      {
      array_push($this->seqset, $seqobj);
      if ($seqobj->seqlen() > $this->length) $this->length = $seqobj->seqlen();
      $this->gap_count += $seqobj->symfreq("-");
      if ($seqobj->seqlen() > $this->length) $this->length = $seqobj->seqlen();

      if ($this->is_flush == TRUE)
         {
         if ($this->seq_count >= 1)
            {
            $firstseq = $this->seqset[0];
            if ($seqobj->seqlen() != $firstseq->seqlen()) $this->is_flush = FALSE;
            }
         }
      $this->seq_count++;
      return count($this->seqset);
      }
   elseif (gettype($seqobj) == "string")
      {
      print "NOT YET OPERATIONAL";
      }
   }

// del_seq() deletes or removes a sequence from an alignment set.
function del_seq($seqobj)
   {
   $seqid = $seqobj;
   $tempset = array();
   foreach($this->seqset as $element)
      {
      if ($element->id != $seqid) array_push($tempset, $element);
      else $removed_seq = $element;
      }
   // Updates the value of the SEQSET property of the SEQALIGN object.
   $this->seqset = $tempset;
   // Updates the value of the SEQ_COUNT property of the SEQALIGN object.
   $this->seq_count--;
   // Updates the value of the LENGTH property of the SEQALIGN object.
   if ($removed_seq->seqlen() == $this->length)
      {
      $maxlen = 0;
      foreach($this->seqset as $element)
         if ($element->seqlen() > $maxlen) $maxlen = $element->seqlen();
      $this->length = $maxlen;
      }
   // Updates the value of the GAP_COUNT property of the SEQALIGN object.
   $this->gap_count -= $removed_seq->symfreq("-");
   // Updates the value of the IS_FLUSH property of the SEQALIGN object.
   if ($this->is_flush == FALSE)
      {
      // Take note that seq_count has already been decremented in the code above.
      if ($this->seq_count <= 1) $this->is_flush = TRUE;
      else
         {
         $samelength = TRUE;
         $ctr = 0;
         foreach($this->seqset as $element)
            {
            $ctr++;
            $currlen = $element->seqlen();
            if ($ctr == 1)
               {
               $prevlen = $currlen;
               continue;
               }
            if ($currlen != $prevlen)
               {
               $samelength = FALSE;
               break;
               }
            $prevlen = $currlen;
            }
         if ($samelength == TRUE) $this->is_flush = TRUE;
         }
      }
   // Return the new number of sequences in the alignment set AFTER delete operation.
   return count($this->seqset);
   }

// SeqAlign() is the constructor method for the SeqAlign class.  It initializes class properties.
function SeqAlign($filename = "", $format = "FASTA")
   { // OPENS function SeqAlign
   if (strlen($filename) == 0)
      {
      $this->seq_count = 0;
      $this->length = 0;
      $this->seqptr = 0;
      $this->gap_count = 0;
      $this->is_flush = TRUE;
      $this->seqset = array();
      return;
      }

   if ($format == "FASTA")
      {
      $flines = file($filename);
      $seqctr = 0;
      $maxlen = 0;
      $maxctr = 0;
      $gapctr = 0;
      $this->seqset = array();
      $samelength = TRUE;
   
      while ( list($no, $linestr) = each($flines) )
         { // OPENS while ( list($no, $linestr) = each($flines) )
         if (substr($linestr, 0, 1) == ">")
            { // start of a new sequence
            $seqctr++;
            $seqlen = strlen($seqstr);
   
            $seq_obj = new seq();
            $seq_obj->id = $prev_id;
            $seq_obj->length = $seqlen;
            $seq_obj->sequence = $seqstr;
            $seq_obj->start = $prev_start;
            $seq_obj->end = $prev_end;
            $localgaps = $seq_obj->symfreq("-");
            $gapctr += $seq_obj->symfreq("-");
   
            if ($seqctr > 1)
               {
               if ($seqlen > $maxlen) $maxlen = $seqlen;
               if (($seqctr >= 3) and ($seqlen != $prev_len))
                  $samelength = FALSE;
               array_push($this->seqset, $seq_obj);
               }
            $seqstr = "";
   
            $words = preg_split("/[\>\/]/", substr($linestr, 1));
            $prev_id = $words[0];
   
            $indexes = preg_split("/-/", $words[1]);
            $prev_start = $indexes[0];
            $prev_end = $indexes[1];
            $prev_len = $seqlen;
            continue;
            }
         else
            {
            $seqstr = $seqstr . trim($linestr);
            }
         } // CLOSES while ( list($no, $linestr) = each($flines) )
   
         $seqlen = strlen($seqstr);
         $seq_obj = new seq();
         $seq_obj->id = $prev_id;
         $seq_obj->start = $prev_start;
         $seq_obj->end = $prev_end;
         $seq_obj->length = $seqlen;
         $seq_obj->sequence = $seqstr;
         $localgaps = $seq_obj->symfreq("-");
         $gapctr += $seq_obj->symfreq("-");
   
         if ($seqctr > 1)
            {
 				if ($seqlen > $maxlen) $maxlen = $seqlen;
            if (($seqctr >= 3) and ($seqlen != $prev_len))
               $samelength = FALSE;
            array_push($this->seqset, $seq_obj);
            }
   
      $this->seq_count = $seqctr;
      $this->length = $maxlen;
      $this->seqptr = 0;
      $this->gap_count = $gapctr;
      $this->is_flush = $samelength;
      } // CLOSES if ($format == "FASTA")
   elseif ($format == "CLUSTAL")
      { // OPENS elseif ($format == "CLUSTAL")
      $flines = file($filename);
      $namelist = array();
      $conserve_line = "";
      $linectr = 0;
      while( list($no, $linestr) = each($flines) )
         { // OPENS while( list($no, $linestr) = each($flines) )
         $linectr++;
         if ($linectr == 1) continue; // skip the first line.
         if (strlen(trim($linestr)) == 0) continue; // ignore blank lines.
      
         $seqname = trim(substr($linestr, 0, 16));
         $seqline = substr($linestr, 16, 60);
      
         if (strlen(trim($seqname)) == 0)
            {
            $conserve_line .= substr($seqline, 0, $lastlen);
            continue;
            }
         if (in_array($seqname, $namelist) == FALSE)
            {
            $namelist[] = $seqname;
            $seq[$seqname] = $seqline;
            $lastlen = strlen(trim($seqline));
            }
         else
            {
            $seq[$seqname] .= trim($seqline);
            $lastlen = strlen(trim($seqline));
            }
         } // CLOSES while( list($no, $linestr) = each($flines) )

         $this->seqset = array();
         $gapctr = 0;
         foreach($seq as $key => $value)
            {
            $seq_obj = new seq();
            $seq_obj->id = $key;
            $seq_obj->length = strlen($value);
            $seq_obj->sequence = $value;
            $seq_obj->start = 0;
            $seq_obj->end = $seq_obj->length - 1;
            $gapctr += $seq_obj->symfreq("-");
            array_push($this->seqset, $seq_obj);
            }
         $this->seq_count = count($namelist);
         $this->length = strlen($conserve_line);
         $this->seqptr = 0;
         $this->gap_count = $gapctr;
         $this->is_flush = TRUE;
      } // CLOSES elseif ($format == "CLUSTAL")
   } // CLOSES function SeqAlign
} // CLOSES class SeqAlign
?>



Back to Top.
Back to Home Page.

 


Copyright © 2003 by Sergio Gregorio, Jr.
All rights reserved.