BioPHP - Sequence Database Class (original)
Original code submitted by joseba. Code below is covered by the GNU GPL v2 license.
Last change: 2005/12/29 14:38 | Recent Changes. SeqDB (short for "Sequence Database"): contains code for the SeqDB class.
Last change: 2005/12/29 14:38 | Download original | Recent Changes | Original code<?php
// NOTE(review): the paths of these two includes are empty in this copy of the
// file (apparently lost when the page was extracted). Presumably they load the
// helper functions used below (e.g. intrim()) and the seq class -- restore the
// real paths before running this file.
require_once("");
require_once("");

// ================== FUNCTIONS ========================

/*
 process_ft() files Swissprot Feature Table (FT) records under per-feature keys.

 parse_swissprot() parses the Feature Table lines (those that begin with FT) in
 a Swissprot data file, extracts the feature key name, from endpoint, to
 endpoint, and description, and stores them in a (simple) array. process_ft()
 then pushes each such array into a larger associative array, called $swiss,
 which is also an attribute of the Seq object. It is assigned a key of the
 form: FT_<feature_key_name>. Examples are: FT_PEPTIDE, FT_DISULFID.

 $swiss  (by reference) associative array that receives the FT_* entries.
 $ft_r   list of arrays whose first element is the feature key name; the
         remaining elements are stored as one record under FT_<key>.
*/
function process_ft(&$swiss, $ft_r)
{
    foreach ($ft_r as $element) {
        $index = "FT_" . $element[0];
        // Drop the key name; what is left is the record to store.
        array_shift($element);
        // isset() instead of count(): count() on a missing index raises a
        // TypeError on PHP 8 (and a warning on PHP 7.2+).
        if (!isset($swiss[$index]) || !is_array($swiss[$index])) {
            $swiss[$index] = array();
        }
        $swiss[$index][] = $element;
    }
}

// at_entrystart() tests if $linestr is the first line of a new sequence entry:
// a line starting with "LOCUS" for GENBANK, or with "ID" for SWISSPROT.
// Returns FALSE for any other $dbformat.
function at_entrystart($linestr, $dbformat)
{
    if ($dbformat == "GENBANK") {
        return (substr($linestr, 0, 5) == "LOCUS");
    }
    if ($dbformat == "SWISSPROT") {
        return (substr($linestr, 0, 2) == "ID");
    }
    return FALSE;
}

// get_entryid() gets the primary accession number of the sequence entry which we are
// currently processing. This uniquely identifies a sequence entry.
//
// $flines   (by reference) array of lines whose internal pointer sits on the line
//           that follows $linestr. The pointer is only inspected, never moved.
// $linestr  the entry-start line (LOCUS line or ID line).
// $dbformat "GENBANK" or "SWISSPROT".
//
// GENBANK:   returns columns 12-27 of the LOCUS line, trimmed.
// SWISSPROT: returns the first ";"-separated item of the AC line that directly
//            follows the ID line, or NULL when the next line is not an AC line.
// Any other format returns NULL.
function get_entryid(&$flines, $linestr, $dbformat)
{
    if ($dbformat == "GENBANK") {
        return trim(substr($linestr, 12, 16));
    }
    if ($dbformat == "SWISSPROT") {
        // Peek at the next line with key()/current() rather than each()+prev():
        // each() no longer exists on PHP 8, and prev() cannot step back once
        // the pointer has run past the last element.
        if (key($flines) === NULL) {
            return NULL;
        }
        $nextline = current($flines);
        if (substr($nextline, 0, 2) == "AC") {
            // intrim() is defined outside this file -- presumably it strips
            // whitespace inside the string; verify against its definition.
            $words = preg_split("/;/", intrim(substr($nextline, 5)));
            return $words[0];
        }
    }
    return NULL;
}

// line2r() copies the lines belonging to a single sequence entry into an array.
// seqdb_peekline() returns the element under the internal pointer of $lines
// without moving the pointer, or NULL when the pointer is past the last element.
function seqdb_peekline(&$lines)
{
    return (key($lines) === NULL) ? NULL : current($lines);
}

// seqdb_nextline() returns the element under the internal pointer of $lines and
// advances the pointer by one, or returns NULL when the pointer is past the last
// element. Stands in for the each() built-in, which was removed in PHP 8.
function seqdb_nextline(&$lines)
{
    if (key($lines) === NULL) {
        return NULL;
    }
    $line = current($lines);
    next($lines);
    return $line;
}

// line2r() reads lines from the open file $fpseq, starting at the current file
// position, up to and including the first line that begins with '//'.
// Returns the lines as an array. If the file ends before a '//' line is seen,
// the lines read so far are returned; FALSE is returned when nothing could be read.
function line2r($fpseq)
{
    $flines = array();
    // Stop on EOF as well: looping on fgets() alone never terminates once the
    // file is exhausted.
    while (($linestr = fgets($fpseq)) !== FALSE) {
        $flines[] = $linestr;
        // left() is defined outside this file -- presumably the leftmost N characters.
        if (left($linestr, 2) == '//') {
            return $flines;
        }
    }
    return (count($flines) > 0) ? $flines : FALSE;
}

// isa_qualifier() tests if $str holds a feature qualifier, i.e. its first
// character is '/'. This applies only to GenBank sequence files.
// firstchar() is defined outside this file.
function isa_qualifier($str)
{
    return (firstchar($str) == '/');
}

// fseekline() positions the file pointer $fp at the start of line $lineno
// (zero-based) and returns that byte offset (from beginning of file).
// Returns FALSE when $lineno is negative or the file has fewer lines.
function fseekline($fp, $lineno)
{
    if ($lineno < 0) {
        return FALSE;
    }
    fseek($fp, 0);
    // After reading $lineno complete lines the pointer sits on the wanted line.
    // fgets() is called without a length so long lines count as one line.
    for ($linectr = 0; $linectr < $lineno; $linectr++) {
        if (fgets($fp) === FALSE) {
            return FALSE;
        }
    }
    return ftell($fp);
}

// bsrch_tabfile() binary-searches a whitespace-separated, sorted table file
// (identified by $fp file pointer, e.g. an *.IDX file) for the row whose
// $col-th column equals $seqid.
// Returns the columns of that row as an array with the zero-based line number
// appended as the last element, or FALSE when no row matches.
function bsrch_tabfile($fp, $col, $seqid)
{
    // Validate the handle before it is used.
    if ($fp == FALSE) {
        die("CANT OPEN FILE");
    }

    // Count the lines so we know the search bounds.
    rewind($fp);
    $lastline = 0;
    while (fgets($fp) !== FALSE) {
        $lastline++;
    }
    rewind($fp);

    $floor = 0;
    $ceiling = $lastline - 1;
    while ($floor <= $ceiling) {
        $lineno = $floor + ((int) (($ceiling - $floor + 1) / 2));
        fseekline($fp, $lineno);
        $word = preg_split("/\s+/", trim(fgets($fp)));
        $value = isset($word[$col]) ? $word[$col] : "";
        if ($value == $seqid) {
            $word[] = $lineno;
            return $word;
        } elseif ($seqid > $value) {
            $floor = $lineno + 1;
        } else {
            $ceiling = $lineno - 1;
        }
    }
    return FALSE;
}

// ================== CLASSES ========================

/*
 SeqDB (short for "Sequence Database") gives record-by-record access to one or
 more GenBank or Swissprot flat files through two index files:
   <dbname>.dir  one row per data file:      "<fileno> <filename>"
   <dbname>.idx  one row per sequence entry: "<id> <fileno> <lineno>", sorted by id
*/
class SeqDB
{ // OPENS definition of SEQDB CLASS.
    var $dbname;    // database name (index files are $dbname.idx / $dbname.dir)
    var $data_fn;   // name of the *.idx file
    var $data_fp;
    var $dir_fn;    // name of the *.dir file
    var $dir_fp;
    var $seqptr;    // zero-based row number of the current entry in the *.idx file
    var $seqcount;  // number of entries in the *.idx file
    var $dbformat;  // "GENBANK" or "SWISSPROT"
    var $bof;       // TRUE after prev() was called on the first entry
    var $eof;       // TRUE after next() was called on the last entry

    // Forwards to the PHP 4 style constructor SeqDB(), which PHP 8 no longer
    // treats as a constructor.
    function __construct()
    {
        $args = func_get_args();
        call_user_func_array(array($this, "SeqDB"), $args);
    }

    // first() positions the sequence pointer (i.e. the seqptr property) to
    // the first sequence in a database (SeqDB object).
    function first()
    {
        $this->seqptr = 0;
        $this->bof = FALSE;
        $this->eof = FALSE;
    }

    // last() positions the sequence pointer (i.e. the seqptr property) to
    // the last sequence in a database (SeqDB object).
    function last()
    {
        $this->seqptr = $this->seqcount - 1;
        $this->bof = FALSE;
        $this->eof = FALSE;
    }

    // prev() (short for previous) positions the sequence pointer to the sequence
    // that comes before the current sequence. When already on the first
    // sequence the pointer stays put and bof is set to TRUE.
    function prev()
    {
        if ($this->seqptr > 0) {
            $this->seqptr--;
            $this->eof = FALSE; // we moved away from the end
        } else {
            $this->bof = TRUE;
        }
    }

    // next() positions the sequence pointer to the sequence that comes after the
    // current sequence. When already on the last sequence the pointer stays
    // put and eof is set to TRUE.
    function next()
    {
        if ($this->seqptr < $this->seqcount - 1) {
            $this->seqptr++;
            $this->bof = FALSE; // we moved away from the beginning
        } else {
            $this->eof = TRUE;
        }
    }

    // fetch() retrieves all data from the specified sequence record and returns them in the
    // form of a Seq object. This method invokes one of several parser methods.
    //
    // Usage: fetch()        fetches the entry at the current sequence pointer.
    //        fetch($seqid)  looks $seqid up in the *.idx file, moves the sequence
    //                       pointer to it and fetches it.
    // Returns FALSE when the entry, its data file or an index file cannot be read.
    function fetch()
    {
        if ($this->data_fn == "") {
            die("Cannot invoke fetch() method from a closed object.");
        }
        // func_get_arg(0) throws on PHP 8 when no argument was passed, so test first.
        $seqid = (func_num_args() > 0) ? func_get_arg(0) : FALSE;

        // IDX and DIR files remain open for the duration of the FETCH() method.
        $fp = fopen($this->data_fn, "r");
        $fpdir = fopen($this->dir_fn, "r");
        if ($fp == FALSE || $fpdir == FALSE) {
            if ($fp != FALSE) fclose($fp);
            if ($fpdir != FALSE) fclose($fpdir);
            return FALSE;
        }

        // $idx_r = (seqid, fileno, lineno [, idx row number])
        $idx_r = FALSE;
        if ($seqid != FALSE) {
            $idx_r = bsrch_tabfile($fp, 0, $seqid);
            if ($idx_r != FALSE) {
                $this->seqptr = $idx_r[3];
            }
        } else {
            // For now, SEQPTR determines CURRENT SEQUENCE ID. Alternative is to track curr line.
            if (fseekline($fp, $this->seqptr) !== FALSE) {
                $row = fgets($fp);
                if ($row !== FALSE && trim($row) != "") {
                    $idx_r = preg_split("/\s+/", trim($row));
                }
            }
        }

        $myseq = FALSE;
        if ($idx_r != FALSE && count($idx_r) >= 3) {
            // $dir_r = (fileno, filename, dir row number)
            $dir_r = bsrch_tabfile($fpdir, 0, $idx_r[1]);
            $fpseq = ($dir_r != FALSE && isset($dir_r[1])) ? fopen($dir_r[1], "r") : FALSE;
            if ($fpseq != FALSE) {
                if (fseekline($fpseq, $idx_r[2]) !== FALSE) {
                    $flines = line2r($fpseq);
                    if ($flines !== FALSE) {
                        $myseq = new seq();
                        if ($this->dbformat == "GENBANK") {
                            $myseq = $this->parse_id($flines);
                        } elseif ($this->dbformat == "SWISSPROT") {
                            $myseq = $this->parse_swissprot($flines);
                        }
                    }
                }
                fclose($fpseq);
            }
        }

        // Close the index files on every path, including "not found".
        fclose($fp);
        fclose($fpdir);
        return $myseq;
    }

    // parse_swissprot() parses the lines of one Swissprot entry ($flines, as
    // produced by line2r()) and returns a Seq object containing parsed data.
    // Everything Swissprot-specific is stored in the object's swissprot array.
    // The helpers left(), right(), intrim(), rem_right() and trim_element() are
    // defined outside this file.
    function parse_swissprot($flines)
    { // OPENS parse_swissprot() function
        $accession = array();
        // NOTE(review): $date_r and $db_r are filled below but not exported.
        $date_r = array();
        $db_r = array();
        $desc = "";
        $desc_lnctr = 0;
        $gename_r = array();
        $os_linectr = 0;
        $os_str = "";
        $oc_linectr = 0;
        $oc_str = "";
        $ref_r = array();
        $ra_ctr = 0;
        $ra_str = "";
        $rl_ctr = 0;
        $rl_str = "";
        $ft_r = array();
        $kw_str = "";
        $kw_r = array();

        // Defaults for values whose source line may be absent from the entry.
        $protein_name = $protein_source = $data_class = $moltype = $length = NULL;
        $create_date = $create_rel = NULL;
        $sequpd_date = $sequpd_rel = NULL;
        $notupd_date = $notupd_rel = NULL;
        $os_line = $oc_line = $organelle = NULL;
        $aa_count = $mol_wt = $chk_no = $chk_method = $sequence = NULL;
        $is_fragment = FALSE;

        reset($flines);
        while (($linestr = seqdb_nextline($flines)) !== NULL) { // OPENS 1st (outermost) while
            $linelabel = left($linestr, 2);
            $linedata = trim(substr($linestr, 5));
            $lineend = right($linedata, 1);

            if ($linelabel == "ID") {
                // ID   ENTRY_NAME   DATA_CLASS;   MOLTYPE;   LENGTH AA.
                $words = preg_split("/;/", substr($linestr, 5));
                // Split on runs of whitespace; a single-character split leaves
                // empty strings between the entry name and the data class.
                $endc = preg_split("/\s+/", trim($words[0]));
                $entry_name = $endc[0];
                $namesrc = preg_split("/_/", $entry_name);
                $protein_name = $namesrc[0];
                $protein_source = isset($namesrc[1]) ? $namesrc[1] : NULL;
                $data_class = isset($endc[1]) ? $endc[1] : NULL;
                $moltype = isset($words[1]) ? trim($words[1]) : NULL;
                // "104 AA." casts to 104.
                $length = isset($words[2]) ? (int) trim($words[2]) : NULL;
            } elseif ($linelabel == "AC") {
                // Drop the last character (the closing ";") before splitting.
                $accstr = substr($linedata, 0, strlen($linedata) - 1);
                $accline = preg_split("/;/", intrim($accstr));
                $accession = array_merge($accession, $accline);
            } elseif ($linelabel == "DT") {
                // DT DD-MMM-YEAR (REL. XX, COMMENT)
                $datestr = substr($linedata, 0, strlen($linedata) - 1);
                // ( "DD-MMM-YEAR ", "REL. XX, COMMENT")
                $words = preg_split("/\(/", $datestr);
                if (count($words) >= 2) {
                    $firstcomma = strpos($words[1], ",");
                    $comment = trim(substr($words[1], $firstcomma + 1));
                    $dt_date = substr($words[0], 0, 11);
                    $dt_rel = substr($words[1], 5, ($firstcomma - 5));
                    // ( "CREATED" => (date, rel), "LAST SEQUENCE UPDATE" => (date, rel),
                    //   "LAST ANNOTATION UPDATE" => (date, rel), COMMENT1 => (date, rel), ... )
                    // For now, we do not check vs. duplicate comments.
                    // We just overwrite the older comment with new one.
                    $date_r[$comment] = array($dt_date, $dt_rel);
                    if ($comment == "CREATED") {
                        $create_date = $dt_date;
                        $create_rel = $dt_rel;
                    } elseif ($comment == "LAST SEQUENCE UPDATE") {
                        $sequpd_date = $dt_date;
                        $sequpd_rel = $dt_rel;
                    } elseif ($comment == "LAST ANNOTATION UPDATE") {
                        $notupd_date = $dt_date;
                        $notupd_rel = $dt_rel;
                    }
                }
            } elseif ($linelabel == "DE") {
                $desc_lnctr++;
                if ($desc_lnctr == 1) $desc .= $linedata;
                else $desc .= " " . $linedata;
                // Checks if (FRAGMENT) or (FRAGMENTS) is found at the end
                // of the DE line to determine if sequence is complete.
                if ($lineend == ".") {
                    $is_fragment = (
                        (strtoupper(right($linedata, 11)) == "(FRAGMENT).") or
                        (strtoupper(right($linedata, 12)) == "(FRAGMENTS).")
                    );
                }
            } elseif ($linelabel == "KW") {
                $kw_str .= $linedata;
                if ($lineend == ".") {
                    $kw_str = rem_right($kw_str);
                    $kw_r = preg_split("/;/", $kw_str);
                    array_walk($kw_r, "trim_element");
                    $kw_str = "";
                }
            } elseif ($linelabel == "OS") {
                $os_linectr++;
                // Join continuation lines with one space (none before the first).
                if ($os_linectr == 1) $os_str .= $linedata;
                else $os_str .= " $linedata";
                if ($lineend == ".") {
                    // we are at the last OS line.
                    $os_str = rem_right($os_str);
                    $os_line = preg_split("/\, AND /", $os_str);
                }
            } elseif ($linelabel == "OG") {
                $organelle = rem_right($linedata);
            } elseif ($linelabel == "OC") {
                $oc_linectr++;
                if ($oc_linectr == 1) $oc_str .= $linedata;
                else $oc_str .= " $linedata";
                if ($lineend == ".") {
                    // we are at the last OC line.
                    $oc_str = rem_right($oc_str);
                    $oc_line = preg_split("/;/", $oc_str);
                    array_walk($oc_line, "trim_element");
                }
            } elseif ($linelabel == "FT") {
                $ft_key = trim(substr($linestr, 5, 8));
                $ft_from = (int) trim(substr($linestr, 14, 6));
                $ft_to = (int) trim(substr($linestr, 21, 6));
                $ft_desc = rem_right(trim(substr($linestr, 34)));
                $ft_r[] = array($ft_key, $ft_from, $ft_to, $ft_desc);
            } elseif ($linelabel == "DR") {
                // DR DATA_BANK_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER
                // Example: DR AARHUS/GHENT-2DPAGE; 8006; IEF.
                // ( (dbname1, pid1, sid1), (dbname2, pid2, sid2), ... )
                $dr_line = preg_split("/;/", rem_right($linedata));
                array_walk($dr_line, "trim_element");
                // Missing items are stored as NULL.
                $db_r[] = array_slice(array_pad($dr_line, 3, NULL), 0, 3);
            } elseif ($linelabel == "RN") { // OPENS "RN"
                // One reference block: RN followed by RP/RC/RM/RX/RA/RL lines.
                // ( rn => ( "RP" => "my rp", "RC" => ("tok1" => "value", ...), ... ) )
                // Remove the [ and ] around the reference number.
                $refno = substr(rem_right($linedata), 1);
                $rc_str = "";
                $inner_r = array();
                $ref_labels = array("RP", "RC", "RM", "RX", "RA", "RL");
                // Peek at each following line; the first line with any other
                // label ends the block and is left for the outer loop.
                while (($refline = seqdb_peekline($flines)) !== NULL) { // OPENS 2nd WHILE
                    $reflabel = left($refline, 2);
                    if (!in_array($reflabel, $ref_labels)) break;
                    next($flines);
                    $refdata = trim(substr($refline, 5));
                    $refend = right($refdata, 1);

                    if ($reflabel == "RP") {
                        $inner_r["RP"] = $refdata;
                    } elseif ($reflabel == "RC") {
                        $rc_str .= $refdata;
                        // Gather the RC lines that directly follow.
                        while (($rcline = seqdb_peekline($flines)) !== NULL && left($rcline, 2) == "RC") {
                            next($flines);
                            $rc_str .= " " . trim(substr($rcline, 5));
                        }
                        // we remove the last character if it is ";"
                        $rc_str = trim($rc_str);
                        if (right($rc_str, 1) == ";") $rc_str = rem_right($rc_str);
                        $rc_line = preg_split("/;/", trim($rc_str));
                        array_walk($rc_line, "trim_element");
                        $innermost = array();
                        foreach ($rc_line as $tokval_str) {
                            // here we assume that there is no whitespace
                            // before or after (left or right of) the "=".
                            $tokval_r = preg_split("/=/", $tokval_str);
                            $innermost[$tokval_r[0]] = isset($tokval_r[1]) ? $tokval_r[1] : NULL;
                        }
                        $inner_r["RC"] = $innermost;
                    } elseif ($reflabel == "RM") {
                        // We have no idea what RM is about, so we assume it's a single-line entry.
                        // which may occur 0 to 1 times inside a SWISSPROT SEQUENCE RECORD.
                        $inner_r["RM"] = $refdata;
                    } elseif ($reflabel == "RX") {
                        $rx_line = preg_split("/;/", intrim(rem_right($refdata)));
                        $inner_r["RX_BDN"] = $rx_line[0];
                        $inner_r["RX_ID"] = isset($rx_line[1]) ? $rx_line[1] : NULL;
                    } elseif ($reflabel == "RA") {
                        $ra_ctr++;
                        if ($ra_ctr == 1) $ra_str = $refdata;
                        else $ra_str .= " $refdata";
                        if ($refend == ";") {
                            $ra_r = preg_split("/\,/", rem_right($ra_str));
                            array_walk($ra_r, "trim_element");
                            $inner_r["RA"] = $ra_r;
                        }
                    } else { // RL
                        $rl_ctr++;
                        if ($rl_ctr == 1) $rl_str = $refdata;
                        else $rl_str .= " $refdata";
                    }
                } // CLOSES 2nd WHILE
                $inner_r["RL"] = $rl_str;
                // (int) cast: arithmetic on a non-numeric string throws on PHP 8.
                $ref_r[((int) $refno) - 1] = $inner_r;
                $ra_str = "";
                $ra_ctr = 0;
                $rl_str = "";
                $rl_ctr = 0;
            } elseif ($linelabel == "GN") {
                // GN is always exactly one line.
                // GNAME1 OR GNAME2                ( (GNAME1, GNAME2) )
                // GNAME1 AND GNAME2               ( (GNAME1), (GNAME2) )
                // GNAME1 AND (GNAME2 OR GNAME3)   ( (GNAME1), (GNAME2, GNAME3) )
                // GNAME1 OR (GNAME2 AND GNAME3)   NOT POSSIBLE!!!
                /* ALGORITHM:
                   1) Split expressions by " AND ".
                   2) Test each "token" if in between parentheses or not.
                   3) If not, then token is a singleton, else it's a multiple-ton.
                   4) Singletons are translated into (GNAME1).
                      Multiple-tons are translated into (GNAME1, GNAME 2).
                   5) Push gene name array into larger array. Go to next token. */
                // Remove the last character which is always a period.
                $gn_str = substr($linedata, 0, strlen($linedata) - 1);
                // is_false() is defined outside this file.
                if (is_false(strpos($gn_str, "("))) {
                    // GN Line does not contain any parentheses.
                    // Ergo, it is made up of all OR's or AND's but not both.
                    if (strpos($gn_str, " OR ") != FALSE) {
                        // Case 1: GNAME1 OR GNAME2.
                        $gename_r[] = preg_split("/ OR /", $gn_str);
                    } elseif (strpos($gn_str, " AND ") != FALSE) {
                        // Case 2: GNAME1 AND GNAME2 AND GNAME3.
                        foreach (preg_split("/ AND /", $gn_str) as $gene) $gename_r[] = array($gene);
                    } else {
                        // Case 0: GN GENENAME1. One gene name (no OR, AND).
                        $gename_r[] = array($gn_str);
                    }
                } else {
                    // GN Line contains at least one pair of parentheses.
                    // Case 3: GNAME1 AND (GNAME2 OR GNAME3) => ( (GNAME1), (GNAME2, GNAME3) )
                    foreach (preg_split("/ AND /", $gn_str) as $gene) {
                        if (substr($gene, 0, 1) == "(") {
                            // a list of 2 or more gene names OR'ed together
                            // remove the "(" and ")" at both ends of the string.
                            $gene = substr($gene, 1);
                            $gene = substr($gene, 0, strlen($gene) - 1);
                            $gename_r[] = preg_split("/ OR /", $gene);
                        } else {
                            // singleton
                            $gename_r[] = array($gene);
                        }
                    }
                }
            } elseif ($linelabel == "SQ") {
                // 0123456789012345678901234567890123456789
                // SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CN;
                // XXXX AA, XXXX MW, XXXX CN
                $words = preg_split("/;/", substr(rem_right($linedata), 8));
                $aa = preg_split("/\s+/", trim($words[0]));
                $aa_count = (int) trim($aa[0]);
                if (isset($words[1])) {
                    $mw = preg_split("/\s+/", trim($words[1]));
                    $mol_wt = (int) trim($mw[0]);
                }
                if (isset($words[2])) {
                    $cn = preg_split("/\s+/", trim($words[2]));
                    $chk_no = trim($cn[0]);
                    $chk_method = isset($cn[1]) ? trim($cn[1]) : NULL;
                }
                // Every line up to the terminating "//" is sequence data.
                $sequence = "";
                while (($seqline = seqdb_nextline($flines)) !== NULL) {
                    if (left($seqline, 2) == "//") break;
                    $sequence .= intrim(trim($seqline));
                }
            }
        } // CLOSES 1st (outermost) while

        // Take the primary accession before shifting it off the list.
        $all_accessions = $accession;
        $primary_accession = isset($accession[0]) ? $accession[0] : NULL;
        array_shift($accession);

        $seqobj = new seq();
        $seqobj->id = $protein_name;
        $seqobj->seqlength = $length;
        $seqobj->moltype = $moltype;
        $seqobj->date = $create_date;
        $seqobj->accession = $primary_accession;
        $seqobj->sec_accession = $accession;
        $seqobj->source = $os_line;
        $seqobj->organism = $oc_line;
        $seqobj->sequence = $sequence;
        $seqobj->definition = $desc;
        $seqobj->keywords = $kw_r;

        // Re-key the Swissprot references the way parse_id() keys GenBank ones.
        $genbank_ref_r = array();
        foreach ($ref_r as $key => $value) {
            // Fresh array per reference so cross-reference keys do not leak
            // from one reference into the next.
            $inner_r = array();
            $inner_r["REFNO"] = $key;
            if (isset($value["RX_BDN"])) {
                $inner_r[$value["RX_BDN"]] = isset($value["RX_ID"]) ? $value["RX_ID"] : NULL;
            }
            $inner_r["REMARKS"] = isset($value["RP"]) ? $value["RP"] : NULL;
            $inner_r["COMMENT"] = isset($value["RC"]) ? $value["RC"] : NULL;
            $inner_r["TITLE"] = isset($value["RL"]) ? $value["RL"] : NULL;
            $inner_r["JOURNAL"] = isset($value["RL"]) ? $value["RL"] : NULL;
            $inner_r["AUTHORS"] = isset($value["RA"]) ? $value["RA"] : NULL;
            $genbank_ref_r[] = $inner_r;
        }
        $seqobj->reference = $genbank_ref_r;

        $swiss = array();
        $swiss["ID"] = $protein_name;
        $swiss["PROT_NAME"] = $protein_name;
        $swiss["MOL_TYPE"] = $moltype;
        $swiss["PROT_SOURCE"] = $protein_source;
        $swiss["DATA_CLASS"] = $data_class;
        $swiss["LENGTH"] = $length;
        $swiss["CREATE_DATE"] = $create_date;
        $swiss["CREATE_REL"] = $create_rel;
        $swiss["SEQUPD_DATE"] = $sequpd_date;
        $swiss["SEQUPD_REL"] = $sequpd_rel;
        $swiss["NOTUPD_DATE"] = $notupd_date;
        $swiss["NOTUPD_REL"] = $notupd_rel;
        // ACCESSION is an ARRAY (primary accession first).
        $swiss["ACCESSION"] = $all_accessions;
        $swiss["PRIM_AC"] = $primary_accession;
        $swiss["DESC"] = $desc;
        $swiss["IS_FRAGMENT"] = $is_fragment;
        // KEYWORDS is an ARRAY.
        $swiss["KEYWORDS"] = $kw_r;
        // ORGANISM is an ARRAY.
        $swiss["ORGANISM"] = $os_line;
        $swiss["ORGANELLE"] = $organelle;
        // FT_<keyword> is an ARRAY.
        process_ft($swiss, $ft_r);
        $swiss["AMINO_COUNT"] = $aa_count;
        $swiss["MOLWT"] = $mol_wt;
        $swiss["CHK_NO"] = $chk_no;
        $swiss["CHK_METHOD"] = $chk_method;
        $swiss["SEQUENCE"] = $sequence;
        // GENE_NAME is an ARRAY.
        $swiss["GENE_NAME"] = $gename_r;
        // ORG_CLASS is an ARRAY.
        $swiss["ORG_CLASS"] = $oc_line;
        // REFERENCE is an ARRAY.
        $swiss["REFERENCE"] = $ref_r;
        $seqobj->swissprot = $swiss; // ARRAY
        return $seqobj;
    } // CLOSES parse_swissprot()

    // _features_done() tests if $linestr ends the FEATURES section of a GenBank
    // entry: no more lines (NULL), or a line starting with BASE COUNT, ORIGIN,
    // CONTIG or //.
    function _features_done($linestr)
    {
        if ($linestr === NULL) return TRUE;
        $head = substr($linestr, 0, 6);
        return (substr($linestr, 0, 10) == "BASE COUNT")
            || ($head == "ORIGIN")
            || ($head == "CONTIG")
            || (substr($linestr, 0, 2) == "//");
    }

    // parse_id() parses the lines of one GenBank entry ($flines, as produced by
    // line2r()) and returns a Seq object containing parsed data.
    // The helper is_blank() is defined outside this file.
    // NOTE(review): features are stored keyed by feature name, so a later
    // feature with the same key (e.g. a second CDS) replaces the earlier one.
    function parse_id($flines)
    {
        $seqarr = array();
        $inseq_flag = false;
        $seqdata_flag = false;
        $accession_flag = false;
        $ref_array = array();
        $feat_r = array();
        $seqdata = "";
        $seqobj = new seq();
        // Keywords that end a REFERENCE block; they are left for the main loop.
        $ref_stop = array("FEATURES", "REFERENCE", "BASE COUNT", "ORIGIN", "CONTIG", "//");

        reset($flines);
        while (($linestr = seqdb_nextline($flines)) !== NULL) { // OPENS outermost while
            if (substr($linestr, 0, 5) == "LOCUS") {
                // This is the beginning of a SEQUENCE ENTRY.
                $ref_array = array();
                $seqdata = "";
                $seqobj = new seq();
                $seqobj->id = trim(substr($linestr, 12, 16));
                $seqobj->seqlength = (int) trim(substr($linestr, 29, 11));
                $seqobj->moltype = substr($linestr, 47, 6);
                $strand_code = substr($linestr, 44, 3);
                if ($strand_code == "ss-") $seqobj->strands = "SINGLE";
                elseif ($strand_code == "ds-") $seqobj->strands = "DOUBLE";
                elseif ($strand_code == "ms-") $seqobj->strands = "MIXED";
                $seqobj->topology = strtoupper(substr($linestr, 55, 8));
                $seqobj->division = strtoupper(substr($linestr, 64, 3));
                $seqobj->date = strtoupper(substr($linestr, 68, 11));
                $inseq_flag = true;
            }

            if (trim(substr($linestr, 0, 10)) == "BASE COUNT") {
                if (count($feat_r) > 0) $seqobj->features = $feat_r;
            }

            if (trim(substr($linestr, 0, 12)) == "FEATURES") { // OPENS FEATURES section
                // The REFERENCE section was present for this SEQUENCE ENTRY so we set REFERENCE attribute.
                if (count($ref_array) > 0) $seqobj->reference = $ref_array;
                $lastsubkey = "";
                $subkey = "";
                $feat_r = array();
                $qual_r = array();

                // Walk the section by peeking, so the line that ends it stays
                // under the array pointer for the outermost loop.
                while (!$this->_features_done($featline = seqdb_peekline($flines))) { // FEATURES WHILE LOOP
                    $label = trim(substr($featline, 0, 21));
                    $data = trim(substr($featline, 21));
                    if (strlen($label) != 0) {
                        // At the beginning of a new SUBKEY.
                        $subkey = $label;
                        // Add/save the qualifier array (qual_r) of the previous SUBKEY to our big feat_r array.
                        if (count($qual_r) > 0) {
                            $feat_r[$lastsubkey] = $qual_r;
                            $qual_r = array();
                        }
                        $qual = $subkey;
                    } elseif (isa_qualifier($data)) {
                        // we are inside a subkey section but on the 2nd, 3rd, nth line which have blank LABELS.
                        // Limit 2 keeps any "=" inside the qualifier value.
                        $wordarray = preg_split("/=/", $data, 2);
                        $qual = $wordarray[0];
                        $data = isset($wordarray[1]) ? $wordarray[1] : "";
                    } else {
                        // Neither a feature key nor a qualifier: skip the line
                        // so the loop always makes progress.
                        next($flines);
                        continue;
                    }

                    $qual_r[$qual] = "";
                    $featline = $featline; // current line is consumed inside the loop below
                    do { // QUALIFIER WHILE LOOP: gather continuation lines
                        $qual_r[$qual] .= " " . $data;
                        next($flines);
                        $featline = seqdb_peekline($flines);
                        if ($featline === NULL) break;
                        $label = trim(substr($featline, 0, 21));
                        $data = trim(substr($featline, 21));
                    } while (is_blank($label) and !(isa_qualifier($data)));

                    if ($featline === NULL) {
                        $lastsubkey = $subkey;
                    } elseif (!(is_blank($label))) {
                        $lastsubkey = $subkey;
                        $subkey = $label;
                    }
                } // FEATURES WHILE LOOP

                if (count($qual_r) > 0) {
                    $feat_r[$lastsubkey] = $qual_r;
                    $qual_r = array();
                }
                // Set the attribute here too, for entries without a BASE COUNT line.
                if (count($feat_r) > 0) $seqobj->features = $feat_r;
                continue;
            } // CLOSES FEATURES section

            if (substr($linestr, 0, 10) == "DEFINITION") {
                $wordarray = explode(" ", $linestr);
                array_shift($wordarray);
                $seqobj->definition = implode(" ", $wordarray);
            }

            if ($inseq_flag != TRUE) continue;

            if (trim(substr($linestr, 0, 12)) == "REFERENCE") {
                // at this point, we are at the line with REFERENCE x (base y of z) in it.
                $wordarray = preg_split("/\s+/", trim(substr($linestr, 12)));
                $ref_rec = array();
                $ref_rec["REFNO"] = $wordarray[0];
                array_shift($wordarray);
                $ref_rec["BASERANGE"] = implode(" ", $wordarray);
                $lastsubkey = "";
                $subkey_lnctr = 0;
                while (($refline = seqdb_peekline($flines)) !== NULL) {
                    $subkey = trim(substr($refline, 0, 12));
                    // The next section starts here; leave its line for the outermost loop.
                    if (in_array($subkey, $ref_stop)) break;
                    next($flines);
                    // If current subkey is blank string, then this is a continuation of the last subsection.
                    if (strlen($subkey) == 0) $subkey = $lastsubkey;
                    // If we are at the next subkey section (e.g. lastsubkey was AUTHORS, and current is TITLE).
                    if ($subkey != $lastsubkey) $subkey_lnctr = 0;
                    $reftext = trim(substr($refline, 12));
                    switch ($subkey) {
                        case "AUTHORS":
                            $subkey_lnctr++;
                            // we remove comma at the end of a name, and the element "and".
                            $newarray = array();
                            foreach (preg_split("/\s+/", $reftext) as $authname) {
                                if (strtoupper($authname) == "AND") continue;
                                if (substr($authname, strlen($authname) - 1, 1) == ",") {
                                    $authname = substr($authname, 0, strlen($authname) - 1);
                                }
                                $newarray[] = $authname;
                            }
                            if ($subkey_lnctr == 1) $ref_rec["AUTHORS"] = $newarray;
                            else $ref_rec["AUTHORS"] = array_merge($ref_rec["AUTHORS"], $newarray);
                            break;
                        case "TITLE":
                        case "JOURNAL":
                        case "REMARK":
                        case "COMMENT":
                            // Multi-line text fields: continuation lines are joined with a space.
                            $subkey_lnctr++;
                            if ($subkey_lnctr == 1) $ref_rec[$subkey] = $reftext;
                            else $ref_rec[$subkey] .= " " . $reftext;
                            break;
                        case "MEDLINE":
                            $ref_rec["MEDLINE"] = substr($refline, 12, 8);
                            break;
                        case "PUBMED":
                            $ref_rec["PUBMED"] = substr($refline, 12, 8);
                            break;
                    }
                    $lastsubkey = $subkey;
                }
                array_push($ref_array, $ref_rec);
                continue;
            }

            $keyword = trim(substr($linestr, 0, 12));

            if ($keyword == "SOURCE") {
                // For now, assume a single-line SOURCE field.
                $seqobj->source = substr($linestr, 12);
            }

            if ($keyword == "SEGMENT") {
                // SEGMENT x of y
                $seqobj->segment = substr($linestr, 12);
                $wordarray = preg_split("/\s+/", trim(substr($linestr, 12)));
                $seqobj->segment_no = $wordarray[0];
                $seqobj->segment_count = isset($wordarray[2]) ? $wordarray[2] : NULL;
            }

            // For now, assume that KEYWORDS field consists of exactly one line.
            if ($keyword == "KEYWORDS") {
                $wordarray = preg_split("/\s+/", trim($linestr));
                array_shift($wordarray);
                $wordarray = preg_split("/;+/", implode(" ", $wordarray));
                if ($wordarray[0] != ".") $seqobj->keywords = $wordarray;
            }

            if (substr($linestr, 0, 7) == "VERSION") {
                // Assume that VERSION line is made up of exactly 2 or 3 tokens.
                $wordarray = preg_split("/\s+/", trim($linestr));
                $seqobj->version = isset($wordarray[1]) ? $wordarray[1] : NULL;
                if (count($wordarray) == 3) $seqobj->ncbi_gi_id = $wordarray[2];
                $accession_flag = false;
            }

            if ($accession_flag == TRUE) {
                if ($keyword == "") {
                    // 2nd, 3rd, etc. line of ACCESSION field.
                    $wordarray = preg_split("/\s+/", trim($linestr));
                    $seqobj->sec_accession = array_merge($seqobj->sec_accession, $wordarray);
                } else {
                    // Any labelled line ends the ACCESSION field.
                    $accession_flag = false;
                }
            }

            if (substr($linestr, 0, 9) == "ACCESSION") {
                $wordarray = preg_split("/\s+/", trim($linestr));
                $seqobj->accession = isset($wordarray[1]) ? $wordarray[1] : NULL;
                array_shift($wordarray);
                array_shift($wordarray);
                $seqobj->sec_accession = $wordarray;
                $accession_flag = true;
            }

            // Compare the trimmed label: a fixed-width literal never matched here.
            if ($keyword == "ORGANISM") {
                $seqobj->organism = substr($linestr, 12);
            }

            if (($seqdata_flag == true) && (substr($linestr, 0, 2) != "//")) {
                // Sequence line: drop the leading position number, join the rest.
                $wordarray = explode(" ", trim($linestr));
                array_shift($wordarray);
                $seqdata .= implode("", $wordarray);
            }

            // Checked after the data test so the ORIGIN line itself is not data.
            if (substr($linestr, 0, 6) == "ORIGIN") $seqdata_flag = true;

            if (substr($linestr, 0, 2) == "//") {
                $seqobj->sequence = $seqdata;
                $seqarr[$seqobj->id] = $seqobj;
                $seqdata_flag = false;
                $inseq_flag = false;
                break;
            }
        } // CLOSES outermost while

        $seqobj->seqarray = $seqarr;
        return $seqobj;
    } // Closes parse_id()

    // open() opens or prepares the SeqDB for processing. Opposite of close().
    // Dies when $dbname.idx or $dbname.dir does not exist. Sets the sequence
    // pointer to 0 and seqcount to the number of non-blank rows in the *.idx file.
    function open($dbname)
    {
        if (file_exists($dbname . ".idx") == FALSE) die("ERROR: Index file $dbname.IDX does not exist!");
        if (file_exists($dbname . ".dir") == FALSE) die("ERROR: Index file $dbname.DIR does not exist!");
        $this->dbname = $dbname;
        $this->data_fn = $dbname . ".idx";
        $this->dir_fn = $dbname . ".dir";
        $this->seqptr = 0;
        $this->bof = FALSE;
        $this->eof = FALSE;

        // One row per entry, so last() and next() have a count to work with.
        $this->seqcount = 0;
        $fp = fopen($this->data_fn, "r");
        if ($fp != FALSE) {
            while (($row = fgets($fp)) !== FALSE) {
                if (trim($row) != "") $this->seqcount++;
            }
            fclose($fp);
        }
    }

    // close() closes the SeqDB database after we're through using it. Opposite of open() method.
    function close()
    {
        // Close simply assigns null values to attributes of the seqdb() object.
        // Methods like fetch would not function properly if these values are null.
        $this->dbname = "";
        $this->data_fn = "";
        $this->dir_fn = "";
        $this->seqptr = -1;
    }

    /*
     SeqDB() is the constructor method for the SeqDB class. It does many things like
     create and/or read a database's index files, initialize certain SeqDB properties, etc.

     Syntax: $seqdb = new seqdb($dbname, $dbformat, $file1, $file2, ...);

     Behavior:
       if $dbname exists and user gave no specific values for $file1, $file2, ...
         then seqdb() object USES/OPENS existing database (index files).
       if $dbname exists and user gave specific values for $file1, $file2, ...
         then seqdb() object OVERWRITES existing database (index files).
       if $dbname does not exist, then seqdb() object CREATES new database,
         even if $file1, $file2, ... are not specified.

       db exists   fileX args   ACTION
           Y           Y        create
           Y           N        use
           N           Y        create
           N           N        create

     "$dbname exists" means that both $dbname.idx and $dbname.dir exist.
     $dbformat defaults to "GENBANK" when omitted or empty.
    */
    function SeqDB()
    {
        // Get all the arguments passed to this function.
        $args = func_get_args();
        $dbname = isset($args[0]) ? $args[0] : "";
        $dbformat = isset($args[1]) ? strtoupper($args[1]) : "";
        if (strlen($dbformat) == 0) $dbformat = "GENBANK";
        $this->dbformat = $dbformat;
        $datafile = array_slice($args, 2);

        // Existing database and no data files given: use it as it is.
        // For now, assume USING/OPENING a database is to be done in READ ONLY MODE.
        if ((count($datafile) == 0) and file_exists($dbname . ".idx") and file_exists($dbname . ".dir")) {
            $this->open($dbname);
            return;
        }

        // Create blank data and directory index files first, so that open()
        // does not die with "Index file does not exist".
        $fp = fopen($dbname . ".idx", "w+");
        $fpdir = fopen($dbname . ".dir", "w+");
        if ($fp == FALSE || $fpdir == FALSE) {
            if ($fp != FALSE) fclose($fp);
            if ($fpdir != FALSE) fclose($fpdir);
            die("ERROR: Cannot create index files for $dbname!");
        }
        // Sets seqptr to 0, etc.
        $this->open($dbname);

        $temp_r = array();
        // Build our *.DIR file
        foreach ($datafile as $fileno => $filename) {
            fputs($fpdir, "$fileno $filename\n");
            // Automatically create an index file containing info across all data files.
            $flines = file($filename);
            if ($flines === FALSE) continue; // unreadable data file: nothing to index
            reset($flines);
            // get_entryid() inspects the array pointer, so walk the array with
            // the pointer functions rather than foreach.
            while (($lineno = key($flines)) !== NULL) {
                $linestr = seqdb_nextline($flines);
                if (at_entrystart($linestr, $dbformat)) {
                    $current_id = get_entryid($flines, $linestr, $dbformat);
                    // Put entries in an array first, sort them, then write to *.IDX file.
                    // temp_r = ("AB1234" => ("AB1234", 1, 12), "BC4321" => ... );
                    $temp_r[$current_id] = array($current_id, $fileno, $lineno);
                }
            }
        }
        // Sort our array by its keys.
        ksort($temp_r);

        // Build our *.IDX file.
        $this->seqcount = count($temp_r);
        foreach ($temp_r as $line_r) {
            fputs($fp, $line_r[0] . " " . $line_r[1] . " " . $line_r[2] . "\n");
        }
        fclose($fp);
        fclose($fpdir);
    } // CLOSES definition of SeqDB constructor function.
} // CLOSES definition of SEQDB CLASS.
?>