BioPHP: PHP for Biocomputing


Source Code Listing for etc.inc
Last updated: April 6, 2003

<?php
/* ETC.INC contains definitions for the SubMatrix and SeqMatch classes.
   It also contains helper functions such as is_blank(), isa_qualifier(), 
   firstchar(), left(), etc. */

// Pattern database: maps a symbolic pattern name to its pattern string.
// Entries are retrieved by name through getpattern().
// NOTE(review): "[UAA,UAG,UGA]" is stored verbatim.  If a consumer interprets it as a
// regular expression, the brackets form a character class rather than a choice among
// three codons -- confirm against the code that uses it.
$patterndb = array(
   "_StartCodon" => "AUG",
   "_EndCodon"   => "[UAA,UAG,UGA]"
);

// trim_element() is a callback for array_walk(): it strips leading and trailing
// whitespace from a single array element, in place.
//   $value - the element, taken by reference so the array itself is updated.
//   $key   - the element's key; required by the callback signature but not used.
function trim_element(&$value, $key)
   {
   $trimmed = trim($value);
   $value = $trimmed;
   }


/*
is_false() returns TRUE only when $value is the boolean FALSE itself; values that are
merely "falsy" (0, "", NULL, ...) yield FALSE.  This is what you need to interpret the
result of functions such as strpos(), which returns 0 when the needle sits at the very
start of the haystack and FALSE when the needle is absent.
*/
function is_false($value)
   {
   // Identity comparison checks both the type (boolean) and the value (FALSE) at once.
   return ($value === FALSE);
   }

// rem_right() removes $charcount characters from the right (end) of a string and
// returns what is left.
//   $str       - the input string.
//   $charcount - number of trailing characters to drop (default 1).
// If $charcount is equal to or larger than the length of $str, the empty string is returned.
function rem_right($str, $charcount = 1)
   {
   $keep = strlen($str) - $charcount;

   // A negative length would make substr() count back from the end of the string
   // (e.g. "abc" with $charcount 5 would yield "a"), so handle "nothing left" explicitly.
   if ($keep <= 0) return "";

   return substr($str, 0, $keep);
   }

// intrim() removes every space character (" ") from a string -- internal spaces as well
// as any leading or trailing ones.  Other whitespace (tabs, newlines) is left untouched.
function intrim($string)
   {
   // str_replace() does a plain literal replacement; no regular expression is needed
   // for a single fixed character.
   return str_replace(' ', '', $string);
   }

// getmin() returns the smallest of three (usually numeric) values $x, $y, and $z.
// Arguments that are FALSE are not given any special treatment.
function getmin($x, $y, $z)
   {
   // Pick the smaller of $x and $y first ($y is chosen on a tie) ...
   $smaller = ($x < $y) ? $x : $y;

   // ... then compare that against $z ($z is chosen on a tie).
   return ($smaller < $z) ? $smaller : $z;
   }

// is_even() returns TRUE when $integer is an even number, FALSE otherwise.
function is_even($integer)
   {
   // An even number halves to a whole number, so truncating the half changes nothing.
   $half = $integer / 2;
   return ($half == (int) $half);
   }

// is_odd() returns TRUE when $integer is an odd number, FALSE otherwise.
// It is the logical opposite of is_even().
function is_odd($integer)
   {
   // An odd number halves to a fraction, so truncating the half alters the value.
   $half = $integer / 2;
   return ($half != (int) $half);
   }

// is_blankstr() returns TRUE only when $var is a string AND that string is empty ("").
// Non-string values that merely compare equal to "" (NULL, FALSE, ...) yield FALSE.
// Like is_false(), it helps interpret the result of some PHP functions or expressions.
function is_blankstr($var)
   {
   return (is_string($var) && ($var === ""));
   }

// is_blank() returns true when $str compares loosely equal (==) to the empty string.
// Unlike is_blankstr() it does not check the type, so values such as NULL and FALSE
// also count as blank.  Candidate for replacement by is_blankstr() once it is confirmed
// that no remaining code relies on the loose comparison.
function is_blank($str)
   {
   return ($str == "");
   }

// firstchar() returns the first (leftmost) character of a string,
// i.e. the same result as left($str, 1).
function firstchar($str)
   {
   return substr($str, 0, 1);
   }

// left() returns the first $numchars characters of $str.
// If $str is shorter than $numchars, the whole string is returned.
function left($str, $numchars)
   {
   $prefix = substr($str, 0, $numchars);
   return $prefix;
   }

// right() returns the last $numchars characters of $str.
// If $str is not longer than $numchars, the whole string is returned.
function right($str, $numchars)
   {
   $len = strlen($str);

   // Without this guard the start offset would go negative and substr() would count
   // it from the end of the string (e.g. right("abc", 5) would yield "bc").
   if ($numchars >= $len) return (string) $str;

   return substr($str, $len - $numchars);
   }

// compare_letter() compares two letters and returns a one-symbol verdict:
//   - $equal   if $let1 and $let2 are the same letter,
//   - $partial if partial_match() finds them together in one rule of $matrix,
//   - $nomatch otherwise.
// Parameters:
//   $let1, $let2 - the letters to compare.
//   $matrix      - substitution matrix (array of rules, each rule an array of symbols).
//                  When omitted or NULL, the rules of the global $chemgrp_matrix are used.
//   $equal       - symbol returned on an exact match.  When omitted or NULL, $let1 itself.
//   $partial     - symbol returned on a partial match (default "+").
//   $nomatch     - symbol returned when there is no match (default ".").
function compare_letter($let1, $let2, $matrix = null, $equal = null, $partial = "+", $nomatch = ".")
   {
   global $chemgrp_matrix;

   // if no symbol for exact matches was provided, use the residue symbol.
   if (isset($equal) == FALSE) $equal = $let1;

   // An exact match never needs the substitution matrix, so decide it first.
   if ($let1 == $let2) return $equal;

   // if no custom substitution matrix was provided, use the default.
   if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;

   if (partial_match($let1, $let2, $matrix)) return $partial;
   return $nomatch;
   }


/* partial_match() tells whether two letters are "partial matches" of each other
   according to a substitution matrix ("submatrix" for short).

   Each element (rule) of a submatrix is an array of symbols that are considered
   partial matches of one another.

   Default submatrix, as listed in the original notes (the global $chemgrp_matrix is
   defined outside this listing):
      ( (G,A,V,L,I), (S,T), (N,Q), (F,Y,W), (C,M), (P), (D,E), (K,R,H), (*), (X) )

   Algorithm:
   1) Check whether both $let1 and $let2 appear in the current rule, starting with the
      first one, e.g. (G,A,V,L,I).
   2) If they do, that is a "hit": the letters are partial matches, return TRUE.
      If they do not, move on to the next rule and repeat step 1.
   3) If the last rule has been checked without a hit, return FALSE.

   Parameters:
      $let1, $let2 - the letters to look up.
      $matrix      - array of rules.  When omitted or NULL, the rules of the global
                     $chemgrp_matrix are used.

   NOTE: No warning is given if $let1 and/or $let2 appears nowhere in the submatrix;
   the function simply returns FALSE.
*/
function partial_match($let1, $let2, $matrix = null)
   {
   global $chemgrp_matrix;

   // Fall back to the default submatrix when the caller supplied none.
   if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;

   foreach($matrix as $rule)
      {
      // Both letters must be members of the same rule to count as a hit.
      if ((in_array($let1, $rule)) and (in_array($let2, $rule))) return TRUE;
      }
   return FALSE;
   }

// getpattern() retrieves a pattern string from the pattern database (the global
// $patterndb array), looked up by its name $pattern.
// Returns the stored pattern string, or NULL when no entry with that name exists.
function getpattern($pattern)
   {
   global $patterndb;

   // Guard the lookup so that an unknown name yields NULL without raising an
   // undefined-index warning.
   if (isset($patterndb[$pattern])) return $patterndb[$pattern];
   return null;
   }

// SubMatrix holds a customized substitution matrix: a list of rules, where each rule is
// an array of symbols that are treated as partial matches of one another.
// See tech doc for details.
class SubMatrix
{
// The list of rules.  Initialized here as well as in the constructor so that the
// property is always an array, whichever way the object was created.
var $rules = array();

// Constructor: starts the matrix with an empty rule list.
function __construct()
   {
   $this->rules = array();
   }

// Old-style constructor name, kept as an ordinary method so that existing code calling
// SubMatrix() explicitly keeps working.  It resets the rule list to empty.
function SubMatrix()
   {
   $this->__construct();
   }

// addrule() adds one rule to the substitution matrix.  It accepts any number of
// arguments; all of them together form the new rule, e.g. addrule("K", "R", "H").
function addrule($x)
   {
   // Collect every argument passed (not just $x) as the symbols of the rule.
   $symbols = func_get_args();
   $this->rules[] = $symbols;
   }
}

// SeqMatch groups sequence-comparison routines: Hamming distance (hamdist), Levenshtein
// distance (levdist, xlevdist), and symbol-by-symbol matching of two sequences (match).
class SeqMatch
{
// Result string produced by the most recent call to match().
var $result;
// Declared for callers' use; none of the methods below assigns these two properties.
var $hamdist;
var $levdist;

// _seqstring() is an internal helper that turns a hamdist()/levdist() argument into a
// plain string: an object contributes its "sequence" property, a string is used as is.
// Any other type yields the empty string.
function _seqstring($seq)
   {
   if (gettype($seq) == "object") return $seq->sequence;
   if (gettype($seq) == "string") return $seq;
   return "";
   }

// hamdist() computes the Hamming Distance between two strings or Seq objects
// of equal length, i.e. the number of positions at which they differ.
// Execution is terminated (die) if the two sequences differ in length.
// For more information, consult the technical reference.

function hamdist($seq1, $seq2)
   {
   // If an argument is a Seq object, its sequence property is what gets compared.
   $string1 = $this->_seqstring($seq1);
   $string2 = $this->_seqstring($seq2);

   // We terminate code execution if the two strings differ in length.
   $len = strlen($string1);
   if ($len != strlen($string2))
      die("Both sequence must be of the same length!");

   // Start from 0 (no difference) and add 1 for every position that differs.
   $distance = 0;
   for($i = 0; $i < $len; $i++)
      {
      if (substr($string1, $i, 1) !== substr($string2, $i, 1)) $distance++;
      }
   return $distance;
   }

// levdist() computes the Levenshtein Distance between two strings or Seq objects
// with equal/unequal lengths.  You can pass custom values for cost of insertion,
// replacement, and deletion.  If you don't pass any, they are assumed to be 1.
// Execution is terminated (die) if either sequence is longer than 255 characters.
// For more information, see technical reference.

function levdist($seq1, $seq2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
   {
   // If an argument is a Seq object, its sequence property is what gets compared.
   $string1 = $this->_seqstring($seq1);
   $string2 = $this->_seqstring($seq2);

   // Check the lengths of the two strings.  If they exceed 255 characters, terminate code.
   if (strlen($string1) > 255) die("String length must not exceed 255 characters!");
   if (strlen($string2) > 255) die("String length must not exceed 255 characters!");

   // Compute and return the Levenshtein Distance using PHP's built-in levenshtein() function.
   return levenshtein($string1, $string2, $cost_ins, $cost_rep, $cost_del);
   }

// xlevdist() is an extended version of levdist() which accepts strings with length
// greater than 255 but not to exceed 1024; longer input terminates execution (die).
// It takes plain strings only, and the cost of insertion, deletion, and replacement
// is fixed to 1.

function xlevdist($s, $t)
   {
   $n = strlen($s);
   $m = strlen($t);

   if (($n > 1024) or ($m > 1024)) die("String length must not exceed 1024 characters");

   // Only two rows of the distance table are needed at any time.
   // Row 0: turning the empty prefix of $s into the first $j characters of $t
   // takes exactly $j insertions.
   $prev = range(0, $m);

   for($i = 1; $i <= $n; $i++)
      {
      $lets = substr($s, $i-1, 1);

      // Column 0: turning the first $i characters of $s into the empty string
      // takes exactly $i deletions.
      $curr = array($i);

      for($j = 1; $j <= $m; $j++)
         {
         $lett = substr($t, $j-1, 1);
         $cost = ($lets === $lett) ? 0 : 1;

         // Cheapest of: delete from $s, insert into $s, or replace/keep the character.
         $curr[$j] = min($prev[$j] + 1, $curr[$j-1] + 1, $prev[$j-1] + $cost);
         } // closes for($j ...
      $prev = $curr;
      } // closes for($i ...

   // After the last row, the final cell is the distance between the full strings
   // (for an empty $s this is row 0, i.e. the length of $t).
   return $prev[$m];
   } // closes function xlevdist()

/*
The match() method accepts two sequence strings (not objects) of equal length,
and returns a sequence match result string built position by position:

  If there is an exact match, the symbol is $equal, or the amino acid symbol
  itself when $equal is omitted or NULL.
  If there is a partial match, the symbol is $partial (a plus sign by default).
  If there is no match, the symbol is $nomatch (a period by default).

$matrix is the substitution matrix (array of rules) used to decide partial matches;
when omitted or NULL, the rules of the global $chemgrp_matrix are used.
Execution is terminated (die) if the two strings differ in length.
The result string is also stored in the result property of the object.
*/

function match($str1, $str2, $matrix = null, $equal = null, $partial = "+", $nomatch = ".")
   {
   global $chemgrp_matrix;

   // if the user chose not to use a custom submatrix, use the default one.
   if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;

   // if the strings differ in length, terminate code execution.
   $seqlength = strlen($str1);
   if ($seqlength != strlen($str2))
      die("Cannot match sequences with unequal lengths");

   // Match the two strings, character by character.  Each call to compare_letter()
   // returns a "result character" which is appended to the "result string".
   $resultstr = "";
   for($i = 0; $i < $seqlength; $i++)
      {
      $let1 = substr($str1, $i, 1);
      $let2 = substr($str2, $i, 1);
      $resultstr .= compare_letter($let1, $let2, $matrix, $equal, $partial, $nomatch);
      }

   // Store the result on the calling SeqMatch object and also return it, so callers
   // can use whichever form gives them more compact code.
   $this->result = $resultstr;
   return $resultstr;
   }
}
?>

Back to Top.
Back to Home Page.

 


Copyright © 2003 by Sergio Gregorio, Jr.
All rights reserved.