BioPHP - Miscellaneous Class
Original code submitted by joseba. Code below is covered by the GNU GPL v2 license.
Description
Last change: 2010/10/18 21:01 | Recent Changes | Original description: (short for "et cetera") Contains miscellaneous code.
Code
Last change: 2010/10/18 21:01 | Recent Changes | Download | Original code<?php
/*
  ETC.INC contains definitions for the SubMatrix and SeqMatch classes. It also
  contains helper functions such as is_blank(), isa_qualifier(), firstchar(),
  left(), etc.
*/

// Pattern database read by getpattern(): maps a pattern name to its pattern string.
$patterndb = array(
    "_StartCodon" => "AUG",
    "_EndCodon"   => "[UAA,UAG,UGA]"
);

/**
 * Removes leading and trailing whitespace from $value, in place.
 *
 * The ($value, $key) signature is the one array_walk() passes to its callback,
 * so array_walk($array, 'trim_element') trims every element of $array.
 *
 * @param string $value Element to trim (modified by reference).
 * @param mixed  $key   Element key; unused, present only for array_walk().
 */
function trim_element(&$value, $key)
{
    $value = trim($value);
}

/**
 * Tests whether a value is the boolean FALSE, as opposed to a merely falsy
 * value such as 0.
 *
 * This is needed to interpret the result of functions like strpos(), which
 * returns 0 when the needle sits at the start of the haystack and FALSE when
 * the needle is absent.
 *
 * @param mixed $value Value to test.
 * @return bool TRUE only when $value is the boolean FALSE.
 */
function is_false($value)
{
    return $value === FALSE;
}

/**
 * Removes $charcount characters from the right (end) of a string.
 *
 * @param string $str       Source string.
 * @param int    $charcount Number of trailing characters to drop (default 1).
 * @return string The shortened string; "" when $charcount is at least the
 *                length of $str.
 */
function rem_right($str, $charcount = 1)
{
    $keep = strlen($str) - $charcount;

    // A negative length would make substr() count from the end of the string
    // and hand back a non-empty prefix, so clamp the "nothing left" case.
    if ($keep <= 0) {
        return "";
    }

    return substr($str, 0, $keep);
}

/**
 * Removes every space character (" ") from a string.
 *
 * Uses str_replace() because the ereg family (eregi_replace) no longer exists
 * in PHP 7 and later; the pattern was a literal space, so the result is the same.
 *
 * @param string $string Source string.
 * @return string $string with all spaces removed.
 */
function intrim($string)
{
    return str_replace(' ', '', $string);
}

/**
 * Returns the minimum of three (usually numeric) values.
 *
 * For now, this cannot handle situations where one or more arguments is FALSE.
 *
 * @param mixed $x First value.
 * @param mixed $y Second value.
 * @param mixed $z Third value.
 * @return mixed The smallest of the three according to the < operator.
 */
function getmin($x, $y, $z)
{
    if ($x < $y) {
        return ($x < $z) ? $x : $z;
    }

    return ($y < $z) ? $y : $z;
}

/**
 * Tests whether an integer is an even number.
 *
 * @param int $integer Number to test.
 * @return bool TRUE when halving $integer leaves no fractional part.
 */
function is_even($integer)
{
    return ($integer / 2) == ((int) ($integer / 2));
}

// is_odd() tests if an integer is an odd number. This is the opposite of is_even().
function is_odd($integer)
{
    // Odd means halving the number leaves a fractional part.
    return ($integer / 2) != ((int) ($integer / 2));
}

/**
 * Tests whether a value is a blank string ("").
 *
 * Like is_false(), this helps interpret the value of some PHP functions or
 * expressions, because it does not treat 0, NULL or FALSE as blank.
 *
 * @param mixed $var Value to test.
 * @return bool TRUE only when $var is a string of length zero.
 */
function is_blankstr($var)
{
    return $var === "";
}

/**
 * Loose blank test: TRUE for anything that compares equal (==) to "".
 *
 * This should eventually give way to is_blankstr(); it is kept because other
 * code may still call it.
 *
 * @param mixed $str Value to test.
 * @return bool Result of the loose comparison with "".
 */
function is_blank($str)
{
    return $str == "";
}

/**
 * Returns the first (beginning) character of a string.
 *
 * @param string $str Source string.
 * @return string The first character of $str.
 */
function firstchar($str)
{
    return left($str, 1);
}

/**
 * Returns the first $numchars characters of a string.
 *
 * @param string $str      Source string.
 * @param int    $numchars Number of leading characters wanted.
 * @return string The leading substring.
 */
function left($str, $numchars)
{
    return substr($str, 0, $numchars);
}

/**
 * Returns the last $numchars characters of a string.
 *
 * @param string $str      Source string.
 * @param int    $numchars Number of trailing characters wanted.
 * @return string The trailing substring; the whole string when $numchars is
 *                at least its length, "" when $numchars is zero or negative.
 */
function right($str, $numchars)
{
    if ($numchars <= 0) {
        return "";
    }

    // strlen($str) - $numchars goes negative when more characters are asked
    // for than exist, and a negative start makes substr() count from the end
    // and return too little. Return the whole string in that case.
    if ($numchars >= strlen($str)) {
        return $str;
    }

    return substr($str, strlen($str) - $numchars);
}

/**
 * Compares two letters and returns a symbol telling whether they are an exact
 * match, a partial match, or a non-match.
 *
 * @param string      $let1    First letter.
 * @param string      $let2    Second letter.
 * @param array|null  $matrix  Substitution matrix (array of rule arrays); NULL
 *                             selects the rules of the global $chemgrp_matrix.
 * @param string|null $equal   Symbol returned on an exact match; NULL returns
 *                             $let1 itself.
 * @param string      $partial Symbol returned on a partial match.
 * @param string      $nomatch Symbol returned when there is no match.
 * @return string One of $equal (or $let1), $partial, $nomatch.
 */
function compare_letter($let1, $let2, $matrix = null, $equal = null, $partial = "+", $nomatch = ".")
{
    global $chemgrp_matrix;

    // If no custom substitution matrix was provided, use the default.
    if ($matrix === null) {
        $matrix = isset($chemgrp_matrix->rules) ? $chemgrp_matrix->rules : array();
    }

    // If no symbol for exact matches was provided, use the residue symbol.
    if ($equal === null) {
        $equal = $let1;
    }

    if ($let1 == $let2) {
        return $equal;
    }
    if (partial_match($let1, $let2, $matrix)) {
        return $partial;
    }

    return $nomatch;
}

/*
  Algorithm: We abbreviate substitution matrix to "submatrix". Each element in
  a submatrix is an array of symbols that are considered "partial matches" of
  each other.

  Default submatrix:
    ( ('G','A','V','L','I'), ('S','T'), ('N','Q'), ('F','Y','W'), ('C','M'),
      ('P'), ('D','E'), ('K','R','H'), ('*'), ('X') )

  1) Check if both $let1 and $let2 appear in the first element (G,A,V,L,I) of
     the substitution matrix.
  2) If they do, you have found a "hit", and $let1 and $let2 are partial
     matches. Return a TRUE value. If they do not, go to the next element in
     the substitution matrix. Repeat steps 1 and 2 until you reach a submatrix
     element where both $let1 and $let2 appear, or until the last element in
     the submatrix has been checked.
  3) If you reach the last submatrix element without a "hit", return a FALSE
     value.

  NOTE: This will not warn if $let1 and/or $let2 is nowhere to be found in the
  whole submatrix.
*/
/**
 * Tells whether two letters belong to the same rule of a substitution matrix.
 *
 * @param string     $let1   First letter.
 * @param string     $let2   Second letter.
 * @param array|null $matrix Array of rule arrays; NULL selects the rules of
 *                           the global $chemgrp_matrix.
 * @return bool TRUE when some rule contains both letters, FALSE otherwise.
 */
function partial_match($let1, $let2, $matrix = null)
{
    global $chemgrp_matrix;

    if ($matrix === null) {
        $matrix = isset($chemgrp_matrix->rules) ? $chemgrp_matrix->rules : array();
    }

    foreach ($matrix as $rule) {
        if (in_array($let1, $rule) and in_array($let2, $rule)) {
            return TRUE;
        }
    }

    return FALSE;
}

/**
 * Retrieves a pattern string from the pattern database (global $patterndb array).
 *
 * @param string $pattern Key of the wanted pattern.
 * @return string|null The stored pattern, or NULL when the key is unknown.
 */
function getpattern($pattern)
{
    global $patterndb;

    return isset($patterndb[$pattern]) ? $patterndb[$pattern] : null;
}

// This class allows the use of customized substitution matrices. See tech doc for details.
class submatrix
{
    // List of rules; each rule is an array of symbols that partially match each other.
    public $rules = array();

    /**
     * Initializes the rules property to the empty array.
     *
     * A PHP 4-style constructor (a method named after the class) is no longer
     * run as a constructor in PHP 8, so __construct() is used instead.
     */
    public function __construct()
    {
        $this->rules = array();
    }

    /**
     * Adds a rule to the substitution matrix.
     *
     * All arguments passed to the call together form one rule, e.g.
     * addrule('S', 'T') appends array('S', 'T').
     *
     * @param mixed $x First symbol of the rule; further symbols may follow.
     */
    public function addrule($x)
    {
        $x = func_get_args();
        array_push($this->rules, $x);
    }
}

class SeqMatch
{
    public $result;
    public $hamdist;
    public $levdist;

    /**
     * Returns the plain sequence string for a string or Seq-like object.
     *
     * @param mixed $seq A string, or an object exposing a "sequence" property.
     * @return string The sequence text.
     */
    private static function seqstring($seq)
    {
        if (is_object($seq)) {
            return (string) $seq->sequence;
        }

        return (string) $seq;
    }

    /**
     * Computes the Hamming Distance between two strings or Seq objects of
     * equal length. For more information, consult the technical reference.
     *
     * Terminates the script (die) when the two sequences differ in length.
     *
     * @param mixed $seq1 String, or object whose sequence property is used.
     * @param mixed $seq2 String, or object whose sequence property is used.
     * @return int Number of positions at which the two sequences differ.
     */
    public function hamdist($seq1, $seq2)
    {
        $string1 = self::seqstring($seq1);
        $string2 = self::seqstring($seq2);

        // We terminate code execution if the two strings differ in length.
        if (strlen($string1) != strlen($string2)) {
            die("Both sequence must be of the same length!");
        }

        $len = strlen($string1);
        $distance = 0;

        // Walk both strings in step; every mismatching position adds 1.
        for ($i = 0; $i < $len; $i++) {
            if ($string1[$i] !== $string2[$i]) {
                $distance++;
            }
        }

        return $distance;
    }

    /**
     * Computes the Levenshtein Distance between two strings or Seq objects of
     * equal or unequal length, using PHP's built-in levenshtein().
     *
     * Terminates the script (die) when either sequence exceeds 255 characters.
     *
     * @param mixed $seq1     String, or object whose sequence property is used.
     * @param mixed $seq2     String, or object whose sequence property is used.
     * @param int   $cost_ins Cost of an insertion (default 1).
     * @param int   $cost_rep Cost of a replacement (default 1).
     * @param int   $cost_del Cost of a deletion (default 1).
     * @return int The weighted Levenshtein distance.
     */
    public function levdist($seq1, $seq2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
    {
        $string1 = self::seqstring($seq1);
        $string2 = self::seqstring($seq2);

        // Check the lengths of the two strings. If they exceed 255 characters, terminate code.
        if (strlen($string1) > 255) {
            die("String length must not exceed 255 characters!");
        }
        if (strlen($string2) > 255) {
            die("String length must not exceed 255 characters!");
        }

        return levenshtein($string1, $string2, $cost_ins, $cost_rep, $cost_del);
    }

    /**
     * Extended version of levdist() that accepts sequences longer than 255 but
     * not longer than 1024 characters. The cost of insertion, deletion and
     * replacement is fixed to 1.
     *
     * Terminates the script (die) when either sequence exceeds 1024 characters.
     *
     * @param mixed $s First sequence (string, or object with a sequence property).
     * @param mixed $t Second sequence (string, or object with a sequence property).
     * @return int The Levenshtein distance between $s and $t.
     */
    public function xlevdist($s, $t)
    {
        $s = self::seqstring($s);
        $t = self::seqstring($t);
        $n = strlen($s);
        $m = strlen($t);

        if (($n > 1024) or ($m > 1024)) {
            die("String length must not exceed 1024 characters");
        }

        // Row 0 of the edit-distance table: turning the empty prefix of $s
        // into the first $j characters of $t takes $j insertions.
        $prev = range(0, $m);

        for ($i = 1; $i <= $n; $i++) {
            // Column 0: turning the first $i characters of $s into the empty
            // string takes $i deletions.
            $curr = array($i);
            $lets = $s[$i - 1];

            for ($j = 1; $j <= $m; $j++) {
                $cost = ($lets === $t[$j - 1]) ? 0 : 1;
                $curr[$j] = min(
                    $prev[$j] + 1,         // deletion
                    $curr[$j - 1] + 1,     // insertion
                    $prev[$j - 1] + $cost  // match or replacement
                );
            }

            // Only the previous row is ever read, so two rows are enough.
            $prev = $curr;
        }

        return $prev[$m];
    }

    /**
     * Matches two sequence strings (not objects) of equal length and returns a
     * result string built character by character:
     *   - exact match:   $equal, or the residue symbol itself when $equal is NULL;
     *   - partial match: $partial (a plus sign by default);
     *   - no match:      $nomatch (a period by default).
     *
     * The result is also stored in the result property of this object, which
     * lets callers write more compact code.
     *
     * Terminates the script (die) when the two strings differ in length.
     *
     * @param string      $str1    First sequence string.
     * @param string      $str2    Second sequence string.
     * @param array|null  $matrix  Custom substitution matrix; NULL selects the
     *                             rules of the global $chemgrp_matrix.
     * @param string|null $equal   Symbol for exact matches; NULL uses the residue.
     * @param string      $partial Symbol for partial matches.
     * @param string      $nomatch Symbol for non-matches.
     * @return string The match result string.
     */
    public function match($str1, $str2, $matrix = null, $equal = null, $partial = "+", $nomatch = ".")
    {
        global $chemgrp_matrix;

        // If the user chose not to use a custom submatrix, use the default one.
        if ($matrix === null) {
            $matrix = isset($chemgrp_matrix->rules) ? $chemgrp_matrix->rules : array();
        }

        // If the strings differ in length, terminate code execution.
        if (strlen($str1) != strlen($str2)) {
            die("Cannot match sequences with unequal lengths");
        }

        $resultstr = "";
        $seqlength = strlen($str1);

        // Each call to compare_letter() returns one "result character" which
        // is appended to the "result string".
        for ($i = 0; $i < $seqlength; $i++) {
            $let1 = substr($str1, $i, 1);
            $let2 = substr($str2, $i, 1);
            $resultstr .= compare_letter($let1, $let2, $matrix, $equal, $partial, $nomatch);
        }

        $this->result = $resultstr;

        return $resultstr;
    }
}
?>