BioPHP - Sequence Aligment Class
Original code submitted by josebaCode bellow is covered by GNU GPL v2 license.
Description
Last change: 2010/10/18 21:01 | Recent Changes | Original description(short for \"sequence alignment\") Contains code for the SeqAlign Class.
Code
Last change: 2010/10/18 21:01 | Recent Changes | Download | Original code<?php require_once(\"seq.inc\"); /* SeqAlign - represents the result of an alignment performed by various third-party software such as ClustalW. The alignment is usually found in a file that uses a particular format. Right now, my code supports only FASTA and CLUSTAL formats. SeqAlign properties and methods allow users to perform post-alignment operations, manipulations, etc. */ class SeqAlign { // OPENS class SeqAlign var $length; var $seq_count; var $gap_count; var $seqset; var $seqptr; var $is_flush; // sort_alpha rearranges the sequences in an alignment set alphabetically by their sequence id. // In addition, you can specify if you wish it be in ascending or descending order via $option. function sort_alpha($option = \"ASC\") { $temp = array(); foreach($this->seqset as $seqitem) { $key = $seqitem->id . str_pad($seqitem->start, 9, \"0\", STR_PAD_LEFT); $temp[$key] = $seqitem; } $option = strtoupper($option); if ($option == \"ASC\") asort($temp); elseif ($option == \"DESC\") arsort($temp); else die(\"Invalid argument #1 passed to SORT_ALPHA() method!\"); $temp2 = array(); foreach($temp as $key => $value) $temp2[] = $value; $this->seqset = $temp2; } // first() moves the sequence pointer to the first sequence in the alignment set. function first() { $this->seqptr = 0; } // last() moves the sequence pointer to the last sequence in the alignment set. function last() { $this->seqptr = $this->seq_count - 1; } // prev() moves the sequence pointer to the sequence before the current one. function prev() { if ($this->seqptr > 0) $this->seqptr--; } // next() moves the sequence pointer to the sequence after the current one. function next() { if ($this->seqptr < $this->seq_count-1) $this->seqptr++; } // fetch() retrieves a particular sequence (identified by its index number) from an alignment set. function fetch($index = \"\") { if (strlen($index) == 0) $index = $this->seqptr; return $this->seqset[$index]; } // get_length() returns the lenght of the longest sequence in an alignment set. function get_length() { $maxlen = 0; foreach($this->seqset as $seqitem) if ($seqitem->length > $maxlen) $maxlen = $seqitem->length; return $maxlen; } // get_gap_count() counts the number of gaps (\"-\") found in all sequences in an alignment set. function get_gap_count() { $gapctr = 0; foreach($this->seqset as $seqitem) $gapctr += $seqitem->symfreq(\"-\"); return $gapctr; } // get_is_flush() tests if all the sequences in an alignment set have the same length. function get_is_flush() { $samelength = TRUE; $ctr = 0; foreach($this->seqset as $element) { $ctr++; $currlen = $element->seqlen(); if ($ctr == 1) { $prevlen = $currlen; continue; } if ($currlen != $prevlen) { $samelength = FALSE; break; } $prevlen = $currlen; } return $samelength; } // char_at_res() returns the character found at a given residue number in a given sequence. function char_at_res($seqidx, $res) { $seqobj = $this->seqset[$seqidx]; if ($res > $seqobj->end) return FALSE; if ($res < $seqobj->start) return FALSE; $len = $seqobj->seqlen(); $nongap_count = $res - $seqobj->start + 1; $nongap_ctr = 0; for($col = 0; $col < $len; $col++) { $currlet = substr($seqobj->sequence, $col, 1); if ($currlet == \"-\") continue; else { $nongap_ctr++; if ($nongap_ctr == $nongap_count) return $currlet; } } } // substr_bw_res() gets the substring between two residues in a sequence that is part of an alignment set. function substr_bw_res($seqidx, $res_beg, $res_end = \"\") { $seqobj = $this->seqset[$seqidx]; // Later, you can return a code which identifies the type of error. if ($res_end > $seqobj->end) return FALSE; if ($res_beg < $seqobj->start) return FALSE; if ( (gettype($res_end) == \"string\") and (strlen($res_end) == 0) ) $res_end = $seqobj->end; $res_begctr = $res_beg - $seqobj->start + 1; $res_endctr = $res_end - $seqobj->start + 1; $len = $seqobj->seqlen(); $nongap_ctr = 0; $subseq = \"\"; for($col = 0; $col < $len; $col++) { $currlet = substr($seqobj->sequence, $col, 1); if ($currlet != \"-\") $nongap_ctr++; if ( ($nongap_ctr >= $res_begctr) and ($nongap_ctr <= $res_endctr) ) $subseq .= $currlet; elseif ($nongap_ctr > $res_endctr) break; } return $subseq; } // col2res() converts a column number to a residue number in a sequence that is part of an alignment set. function col2res($seqidx, $col) { $seqobj = $this->seqset[$seqidx]; // Later, you can return a code which identifies the type of error. if ($col > $seqobj->seqlen() - 1) return FALSE; if ($col < 0) return FALSE; $nongap_ctr = 0; for($i = 0; $i <= $col; $i++) { $currlet = substr($seqobj->sequence, $i, 1); if ($currlet != \"-\") $nongap_ctr++; } if ($currlet == \"-\") return \"-\"; else return ($seqobj->start + $nongap_ctr - 1); } // res2col() converts a residue number to a column number in a sequence in an alignment set. function res2col($seqidx, $res) { $seqobj = $this->seqset[$seqidx]; // Later, you can return a code which identifies the type of error. if ($res > $seqobj->end) return FALSE; if ($res < $seqobj->start) return FALSE; $len = $seqobj->seqlen(); $nongap_count = $res - $seqobj->start + 1; $nongap_ctr = 0; for($col = 0; $col < $len; $col++) { $currlet = substr($seqobj->sequence, $col, 1); if ($currlet == \"-\") continue; else { $nongap_ctr++; if ($nongap_ctr == $nongap_count) return $col; } } } // subalign() returns a subset of consecutive sequences in an alignment set. function subalign($beg, $end) { if (($beg < 0) or ($end < 0)) die(\"Invalid argument passed to SUBALIGN() method!\"); if (($beg > $this->seq_count-1) or ($end > $this->seq_count-1)) die(\"Invalid argument passed to SUBALIGN() method!\"); $new_seqset = array(); $new_align = new SeqAlign(); $new_align->seqset = array_slice($this->seqset, $beg, $end-$beg+1); $new_align->length = $new_align->get_length(); $new_align->seq_count = $end - $beg + 1; $new_align->gap_count = $new_align->get_gap_count(); $new_align->seqptr = 0; $new_align->is_flush = $new_align->get_is_flush(); return $new_align; } // select() returns a set of (possibly non-consecutive) sequences in an alignment set. function select() { $arglist = func_get_args(); if (count($arglist) == 0) die(\"Must pass at least one argument to SELECT() method!\"); $new_seqset = array(); $new_align = new SeqAlign(); $ctr = 0; foreach($arglist as $seqindex) { $new_seqset[] = $this->seqset[$seqindex]; $ctr++; } $new_align->seqset = $new_seqset; $new_align->length = $new_align->get_length(); $new_align->seq_count = count($arglist); $new_align->gap_count = $new_align->get_gap_count(); $new_align->seqptr = 0; $new_align->is_flush = $new_align->get_is_flush(); return $new_align; } // res_var() identifies the positions of variant and invariant (conserved) residues in an alignment set. function res_var($threshold = 100) { // for now, assume all the sequences are equal in length. $alphabet = array(\'A\',\'B\',\'C\',\'D\',\'E\',\'F\',\'G\',\'H\',\'I\',\'J\',\'K\',\'L\',\'M\', \'N\',\'O\',\'P\',\'Q\',\'R\',\'S\',\'T\',\'U\',\'V\',\'W\',\'X\',\'Y\',\'Z\'); $all_pos = array(); $invar_pos = array(); $var_pos = array(); $firstseq = $this->seqset[0]; $seqlength = strlen($firstseq->sequence); $globfreq = array(); for($i = 0; $i < count($alphabet); $i++) { $currlet = $alphabet[$i]; $globfreq[$currlet] = 0; } for($i = 0; $i < $seqlength; $i++) { $freq = $globfreq; for($j = 0; $j < $this->seq_count; $j++) { $currseq = $this->seqset[$j]; $currlet = substr($currseq->sequence, $i, 1); $freq[$currlet]++; } arsort($freq); list($key, $value) = each($freq); $maxpercent = ($value/$this->seq_count) * 100; if ($maxpercent >= $threshold) array_push($invar_pos, $i); else array_push($var_pos, $i); } $all_pos[\"INVARIANT\"] = $invar_pos; $all_pos[\"VARIANT\"] = $var_pos; return $all_pos; } // consensus() returns the consensus string for an alignment set. See technical reference for details. function consensus($threshold = 100) { // for now, assume all the sequences are equal in length. $alphabet = array(\'A\',\'B\',\'C\',\'D\',\'E\',\'F\',\'G\',\'H\',\'I\',\'J\',\'K\',\'L\',\'M\', \'N\',\'O\',\'P\',\'Q\',\'R\',\'S\',\'T\',\'U\',\'V\',\'W\',\'X\',\'Y\',\'Z\'); $resultstr = \"\"; $firstseq = $this->seqset[0]; $seqlength = strlen($firstseq->sequence); $globfreq = array(); for($i = 0; $i < count($alphabet); $i++) { $currlet = $alphabet[$i]; $globfreq[$currlet] = 0; } for($i = 0; $i < $seqlength; $i++) { $freq = $globfreq; for($j = 0; $j < $this->seq_count; $j++) { $currseq = $this->seqset[$j]; $currlet = substr($currseq->sequence, $i, 1); $freq[$currlet]++; } arsort($freq); list($key, $value) = each($freq); $maxpercent = ($value/$this->seq_count) * 100; if ($maxpercent >= $threshold) $resultstr = $resultstr . $key; else $resultstr = $resultstr . \"?\"; } return $resultstr; } // add_seq() adds a sequence to an alignment set. function add_seq($seqobj) { if (gettype($seqobj) == \"object\") { array_push($this->seqset, $seqobj); if ($seqobj->seqlen() > $this->length) $this->length = $seqobj->seqlen(); $this->gap_count += $seqobj->symfreq(\"-\"); if ($seqobj->seqlen() > $this->length) $this->length = $seqobj->seqlen(); if ($this->is_flush == TRUE) { if ($this->seq_count >= 1) { $firstseq = $this->seqset[0]; if ($seqobj->seqlen() != $firstseq->seqlen()) $this->is_flush = FALSE; } } $this->seq_count++; return count($this->seqset); } elseif (gettype($seqobj) == \"string\") { print \"NOT YET OPERATIONAL\"; } } // del_seq() deletes or removes a sequence from an alignment set. function del_seq($seqobj) { $seqid = $seqobj; $tempset = array(); foreach($this->seqset as $element) { if ($element->id != $seqid) array_push($tempset, $element); else $removed_seq = $element; } // Updates the value of the SEQSET property of the SEQALIGN object. $this->seqset = $tempset; // Updates the value of the SEQ_COUNT property of the SEQALIGN object. $this->seq_count--; // Updates the value of the LENGTH property of the SEQALIGN object. if ($removed_seq->seqlen() == $this->length) { $maxlen = 0; foreach($this->seqset as $element) if ($element->seqlen() > $maxlen) $maxlen = $element->seqlen(); $this->length = $maxlen; } // Updates the value of the GAP_COUNT property of the SEQALIGN object. $this->gap_count -= $removed_seq->symfreq(\"-\"); // Updates the value of the IS_FLUSH property of the SEQALIGN object. if ($this->is_flush == FALSE) { // Take note that seq_count has already been decremented in the code above. if ($this->seq_count <= 1) $this->is_flush = TRUE; else { $samelength = TRUE; $ctr = 0; foreach($this->seqset as $element) { $ctr++; $currlen = $element->seqlen(); if ($ctr == 1) { $prevlen = $currlen; continue; } if ($currlen != $prevlen) { $samelength = FALSE; break; } $prevlen = $currlen; } if ($samelength == TRUE) $this->is_flush = TRUE; } } // Return the new number of sequences in the alignment set AFTER delete operation. return count($this->seqset); } // SeqAlign() is the constructor method for the SeqAlign class. It initializes class properties. function SeqAlign($filename = \"\", $format = \"FASTA\") { // OPENS function SeqAlign if (strlen($filename) == 0) { $this->seq_count = 0; $this->length = 0; $this->seqptr = 0; $this->gap_count = 0; $this->is_flush = TRUE; $this->seqset = array(); return; } if ($format == \"FASTA\") { $flines = file($filename); $seqctr = 0; $maxlen = 0; $maxctr = 0; $gapctr = 0; $this->seqset = array(); $samelength = TRUE; while ( list($no, $linestr) = each($flines) ) { // OPENS while ( list($no, $linestr) = each($flines) ) if (substr($linestr, 0, 1) == \">\") { // start of a new sequence $seqctr++; $seqlen = strlen($seqstr); $seq_obj = new seq(); $seq_obj->id = $prev_id; $seq_obj->length = $seqlen; $seq_obj->sequence = $seqstr; $seq_obj->start = $prev_start; $seq_obj->end = $prev_end; $localgaps = $seq_obj->symfreq(\"-\"); $gapctr += $seq_obj->symfreq(\"-\"); if ($seqctr > 1) { if ($seqlen > $maxlen) $maxlen = $seqlen; if (($seqctr >= 3) and ($seqlen != $prev_len)) $samelength = FALSE; array_push($this->seqset, $seq_obj); } $seqstr = \"\"; $words = preg_split(\"/[\\>\\/]/\", substr($linestr, 1)); $prev_id = $words[0]; $indexes = preg_split(\"/-/\", $words[1]); $prev_start = $indexes[0]; $prev_end = $indexes[1]; $prev_len = $seqlen; continue; } else { $seqstr = $seqstr . trim($linestr); } } // CLOSES while ( list($no, $linestr) = each($flines) ) $seqlen = strlen($seqstr); $seq_obj = new seq(); $seq_obj->id = $prev_id; $seq_obj->start = $prev_start; $seq_obj->end = $prev_end; $seq_obj->length = $seqlen; $seq_obj->sequence = $seqstr; $localgaps = $seq_obj->symfreq(\"-\"); $gapctr += $seq_obj->symfreq(\"-\"); if ($seqctr > 1) { if ($seqlen > $maxlen) $maxlen = $seqlen; if (($seqctr >= 3) and ($seqlen != $prev_len)) $samelength = FALSE; array_push($this->seqset, $seq_obj); } $this->seq_count = $seqctr; $this->length = $maxlen; $this->seqptr = 0; $this->gap_count = $gapctr; $this->is_flush = $samelength; } // CLOSES if ($format == \"FASTA\") elseif ($format == \"CLUSTAL\") { // OPENS elseif ($format == \"CLUSTAL\") $flines = file($filename); $namelist = array(); $conserve_line = \"\"; $linectr = 0; while( list($no, $linestr) = each($flines) ) { // OPENS while( list($no, $linestr) = each($flines) ) $linectr++; if ($linectr == 1) continue; // skip the first line. if (strlen(trim($linestr)) == 0) continue; // ignore blank lines. $seqname = trim(substr($linestr, 0, 16)); $seqline = substr($linestr, 16, 60); if (strlen(trim($seqname)) == 0) { $conserve_line .= substr($seqline, 0, $lastlen); continue; } if (in_array($seqname, $namelist) == FALSE) { $namelist[] = $seqname; $seq[$seqname] = $seqline; $lastlen = strlen(trim($seqline)); } else { $seq[$seqname] .= trim($seqline); $lastlen = strlen(trim($seqline)); } } // CLOSES while( list($no, $linestr) = each($flines) ) $this->seqset = array(); $gapctr = 0; foreach($seq as $key => $value) { $seq_obj = new seq(); $seq_obj->id = $key; $seq_obj->length = strlen($value); $seq_obj->sequence = $value; $seq_obj->start = 0; $seq_obj->end = $seq_obj->length - 1; $gapctr += $seq_obj->symfreq(\"-\"); array_push($this->seqset, $seq_obj); } $this->seq_count = count($namelist); $this->length = strlen($conserve_line); $this->seqptr = 0; $this->gap_count = $gapctr; $this->is_flush = TRUE; } // CLOSES elseif ($format == \"CLUSTAL\") } // CLOSES function SeqAlign } // CLOSES class SeqAlign ?>