Login | Register

Info | Home

BioPHP - Miscellaneus Class (original)

Original code submitted by joseba
Code bellow is covered by GNU GPL v2 license.

Description

Last change: 2005/12/29 14:33 | Recent Changes
(short for \"et cetera\") Contains miscellaneous code

Code

Last change: 2005/12/29 14:33 | Download original | Recent Changes | Original code
<?php
/* ETC.INC contains definitions for the SubMatrix and SeqMatch classes.
   It also contains helper functions such as is_blank(), isa_qualifier(), 
   firstchar(), left(), etc. */

$patterndb = array(\"_StartCodon\" => \"AUG\", \"_EndCodon\" => \"[UAA,UAG,UGA]\");

// trim_element() removes leading and trailing spaces from a string.  In conjunction 
// with the array_walk() function, it removes spaces from each element of an array.
function trim_element(&$value, $key)
        {
        $value = trim($value);
        }

/*
is_false() tests if a value is a boolean false and not a zero (0).  This is necessary 
to correctly interpret the return value of some PHP functions like strpos().  strpos()
returns a zero (0) if a string is found at the beginning of a larger string, and FALSE
if it cannot find that string within the larger string.  In PHP, FALSE equals 0. 
*/
function is_false($value)
        {
        if ( (gettype($value) == \"boolean\") and
                  ($value == FALSE) ) return TRUE;
        else return FALSE;
        }

// rem_right() removes $charcount characters from the right (end) of a string.
function rem_right($str, $charcount = 1)
        {
        return substr($str, 0, strlen($str)-$charcount);
        }

// intrim() removes \"internal spaces\" (as opposed to leading and trailing spaces) from a string.
function intrim($string)
        {
        return eregi_replace(\' \', \'\', $string); 
        }

// getmin() gets the minimum of three (usually numeric) values $x, $y, and $z.
// For now, this can\'t handle situations when one or more arguments is FALSE.
function getmin($x, $y, $z)
        {
        if ($x < $y)
                if ($x < $z) return $x;
                else return $z;
        else
                if ($y < $z) return $y;
                else return $z;
        }

// is_even() tests if an integer is an even number.
function is_even($integer)
        {
        if (($integer/2) == ((int) ($integer/2))) return TRUE;
        else return FALSE;
        }

// is_odd() tests if an integer is an odd number.  This is the opposite of is_even().
function is_odd($integer)
        {
        if (($integer/2) != ((int) ($integer/2))) return TRUE;
        else return FALSE;
        }

// is_blankstr() tests if a value is a blank string (\"\").  Like is_false(), this
// helps interpret the value of some PHP functions or expressions.
function is_blankstr($var)
        {
        if ( (gettype($var) == \"string\") and ($var == \"\") ) return TRUE;
        else return FALSE;
        }

// I think this function should give way to or be replaced by is_blankstr().
// I haven\'t removed this yet as I have to check if some code still uses it.
function is_blank($str)
        {
        if ($str == \"\") return true;
        else return false;
        }

// firstchar() returns the first or beginning character of a string.
function firstchar($str)
        {
        return left($str, 1);
        }

// left() returns the first $numchars characters of a string.
function left($str, $numchars)
        {
        return substr($str, 0, $numchars);
        }

// right() returns the substring beginning at $numchars characters from the right end of a string.
function right($str, $numchars)
        {
        return substr($str, strlen($str)-$numchars);
        }

// compare_letter() compares two letters $let1 and $let2 and returns another letter
// indicating if the two were exact matches, partial matches, or non-matches.
function compare_letter($let1, $let2, $matrix, $equal, $partial = \"+\", $nomatch = \".\")
        {
        global $chemgrp_matrix;

        // if no custom substitution matrix was provided, use the default.
        if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;
        
        // if no symbol for exact matches was provided, use the residue symbol.
        if (isset($equal) == FALSE) $equal = $let1;
        
        if ($let1 == $let2) return $equal;
        elseif (partial_match($let1, $let2, $matrix)) return $partial;
        else return $nomatch;
        }

/* Algorithm:
   We abbreviate substitution matrix to \"submatrix\".  Each element in a submatrix is an array of
        symbols that are considered \"partial matches\" of each other.
        
   Default submatrix: 
        ( (\'G\',\'A\',\'V\',\'L\',\'I\'), (\'S\',\'T\'), (\'N\',\'Q\'), (\'F\',\'Y\',\'W\'), (\'C\', \'M\'), (\'P\'), (\'D\',\'E\'), (\'K\',\'R\',\'H\'), 
          (\'*\'), (\'X\') )

        1) Check if both $let1 and $let2 appear in the first element (G,A,V,L,I) of the substitution matrix.

        2) If they are, you\'ve found a \"hit\", and $let1 and $let2 are partial matches.  Return a TRUE value.
      If they are not, then go to the next element in the substitution matrix.  
                
                Repeat steps 1 and 2 until you reach a submatrix element where both $let1 and $let2 appear, or 
                until the last element in the submatrix has been checked.
                
        3) If you reach the last submatrix element without a \"hit\", return a FALSE value.
        
   NOTE: This will not warn if you $let1 and/or $let2 is nowhere to be found in the whole submatrix.                
*/
function partial_match($let1, $let2, $matrix)
        {
        global $chemgrp_matrix;
        if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;
        foreach($matrix as $rule)
                if ((in_array($let1, $rule)) and (in_array($let2, $rule))) return TRUE;
        return FALSE;
        }

// getpattern() retrieves the pattern string from the pattern database ($patternDB array).
function getpattern($pattern)
        {
        global $patterndb;
        return $patterndb[$pattern];
        }

// This class allows the use of customized substitution matrices.  See tech doc for details.
class submatrix
{
var $rules;

// submatrix simply initializes the rules property to the empty array.
function submatrix()
        {
        $this->rules = array();
        }

// addrule() adds a rule to the substitution matrix.
function addrule($x)
        {
        $x = func_get_args();
        // if (isset($this->rules) == FALSE) $this->rules = array();
        array_push($this->rules, $x);
        }
}

class SeqMatch
{
var $result;
var $hamdist;
var $levdist;

// hamdist() computes the Hamming Distance between two strings or Seq objects 
// of equal length.  For more information, consult the technical reference.

function hamdist($seq1, $seq2)
   {
   // If $seq1 is a Seq object, we use its sequence property to compute Hamming Distance.
   if (gettype($seq1) == \"object\") $string1 = $seq1->sequence;
   elseif (gettype($seq1) == \"string\") $string1 = $seq1;

   // If $seq2 is a Seq object, we use its sequence property to compute Hamming Distance.
   if (gettype($seq2) == \"object\") $string2 = $seq2->sequence;
   elseif (gettype($seq2) == \"string\") $string2 = $seq2;

   // We terminate code execution if the two strings differ in length.
   if (strlen($string1) != strlen($string2))
      die(\"Both sequence must be of the same length!\");

   $len = strlen($string1);
   // Initialize the hamming distance to 0 (no difference between two strings).
   $distance = 0;

   // Match the two strings, character by character.  If they are NOT
   // identical, increment $distance by 1.
   for($i = 0; $i < $len; $i++)
      {
      $let1 = substr($string1, $i, 1);
      $let2 = substr($string2, $i, 1);
      if ($let1 != $let2) $distance++;
      }
   return $distance;
   }

// levdist() computes the Levenshtein Distance between two strings or Seq objects 
// with equal/unequal lengths.  You can pass custom values for cost of insertion,
// replacement, and deletion.  If you don\'t pass any, they are assumed to be 1.
// For more information, see technical reference.

function levdist($seq1, $seq2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
   {
   // If $seq1 is a Seq object, we use its sequence property to compute Levenshtein Distance.
   if (gettype($seq1) == \"object\") $string1 = $seq1->sequence;
   elseif (gettype($seq1) == \"string\") $string1 = $seq1;

   // If $seq2 is a Seq object, we use its sequence property to compute Levenshtein Distance.
   if (gettype($seq2) == \"object\") $string2 = $seq2->sequence;
   elseif (gettype($seq2) == \"string\") $string2 = $seq2;

   // Check the lengths of the two strings.  If they exceed 255 characters, terminate code.
   if (strlen($string1) > 255) die(\"String length must not exceed 255 characters!\");
   if (strlen($string2) > 255) die(\"String length must not exceed 255 characters!\");

   // Compute and return the Levenshtein Distance using PHP\'s built-in levenshtein() function.
   return levenshtein($string1, $string2, $cost_ins, $cost_rep, $cost_del);
   }

// xlevdist() is an extended version of levdist() which accepts strings with length
// greater than 255 but not to exceed 1024 (which takes my CPU 18 seconds to compute).
// The only drawback to xlevdist is that the cost of insertion, deletion, and replacement
// is fixed to 1.  I have yet to find a way to allow custom values for these.

function xlevdist($s, $t)
        {
        $n = strlen($s);
        $m = strlen($t);

        if (($n > 1024) or ($m > 1024)) die(\"String length must not exceed 1024 characters\");

        // initialize the array
        $values = array();
        $temp = array();
        $temp[0] = 0;

        for($j = 1; $j <= $m; $j++)
                $temp[$j] = 0;

        $values[0] = $temp;
        for($i = 1; $i <= $n; $i++)
                $values[$i] = $temp;

        for($i = 1; $i <= $n; $i++)
                { // OPENS for($i = 1; $i <= $n; $i++) 
                $lets = substr($s, $i-1, 1);
                for($j = 1; $j <= $m; $j++)
                        { // OPENS for($j = 1; $j <= $m; $j++)
                        $lett = substr($t, $j-1, 1);
                        if ($lets == $lett) $cost = 0;
                        else $cost = 1;

                        // \"normal\" values of $up, $left, and $upleft
                        if ($j > 1) $up = $values[$i][$j-1];
                        else $up = FALSE;
                        if ($i > 1) $left = $values[$i-1][$j];
                        else $left = FALSE;
                        if (($i > 1) and ($j > 1)) $upleft = $values[$i-1][$j-1];
                        else $upleft = FALSE;

                        if ($i == 1)
                                {
                                if ($j == 1) $value = $cost;
                                elseif ($cost == 0) $value = $cost;
                                else $value = $up + 1;
                                }
                        else
                                {
                                // if at the first or topmost row, there is no upleft and above.
                                if ($j == 1)
                                        {
                                        if ($cost == 0) $value = $cost;
                                        else $value = $left + 1;
                                        }
                                else $value = getmin($up + 1, $left + 1, $upleft + $cost);
                                }
                        $values[$i][$j] = $value;                        
                        } // CLOSES for($j = 1; $j <= $m; $j++)
                } // CLOSES for($i = 1; $i <= $n; $i++) 
                return $values[$n][$m];
        } // closes function xlevdist()

/*
The match() method accepts two sequence strings (not objects) of equal length,
and returns a sequence match result string, according to the following rules:

  If there is an exact match, return the amino acid symbol.
  If there is a partial match, return a plus sign.
  If there is no match, return a whitespace character.
*/

function match($str1, $str2, $matrix, $equal, $partial = \"+\", $nomatch = \".\")
        {
        global $chemgrp_matrix;

        // if the user chose not to use a custom submatrix, use the default one.
        if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;
        
        // if the strings differ in length, terminate code execution.
        if (strlen($str1) != strlen($str2))
                die(\"Cannot match sequences with unequal lengths\");
        $resultstr = \"\";
        $seqlength = strlen($str1);
        
   // Match the two strings, character by character.  Each call to compare_letter()
        // function returns a \"result character\" which is appended to a \"result string\".
        for($i = 0; $i < $seqlength; $i++)
                {
                $let1 = substr($str1, $i, 1);
                $let2 = substr($str2, $i, 1);
                $resultstr = $resultstr . compare_letter($let1, $let2, $matrix, $equal, $partial, $nomatch);
                }
                
   // Assign \"result string\" to the result property of the calling SeqMatch object. 
        $this->result = $resultstr;
        
        // Return the result string.  While this line and the line above seems redundant, their
        // presense here actually permits programmers to write more compact code.
        return $resultstr;
        }
}
?>