BioPHP - Sequence Database Class

BioPHP - Sequence Database Class (original)

Original code submitted by joseba
Code below is covered by the GNU GPL v2 license.

Description

Last change: 2005/12/29 14:38 | Recent Changes

SeqDB (short for \"Sequence Database\"): contains the code for the SeqDB class.

Code

Last change: 2005/12/29 14:38 | Download original | Recent Changes | Original code

<?php
require_once(\"etc.inc\");
require_once(\"seq.inc\");
 
// ================== FUNCTIONS ========================

/*
We describe parse_swissprot() first.

parse_swissprot() parses the Feature Table lines (those that begin with FT) in a Swissprot
data file, extracts the feature key name, from endpoint, to endpoint, and description, and
stores them in a (simple) array.  

process_ft() then pushes this array into a larger associative array, called $swiss, which is 
also an attribute of the Seq object. It is assigned a key of the form: FT_<feature_key_name>.
Examples are: FT_PEPTIDE, FT_DISULFID.
*/
// process_ft() files every parsed feature-table row of $ft_r under the key
// FT_<feature_key_name> of $swiss.
//   $swiss : associative array, passed by reference and modified in place.
//   $ft_r  : list of rows; element 0 of each row is the feature key name, the remaining
//            elements are stored as the row data.
// Returns nothing.
// A missing key is created after an isset() test. The former count() probe on a key that
// did not exist yet raised an undefined-index notice and, on PHP 8, a TypeError.
function process_ft(&$swiss, $ft_r)
        {
        foreach($ft_r as $element)
                {
                $index = \"FT_\" . $element[0];
                // Drop the key name so that only the row data is stored.
                array_shift($element);
                if (!isset($swiss[$index]) || !is_array($swiss[$index]))
                        $swiss[$index] = array();
                $swiss[$index][] = $element;
                }
        }

// at_entrystart() tells whether $linestr is the opening line of a sequence entry.
// A GENBANK entry opens with LOCUS, a SWISSPROT entry opens with ID.
// Returns TRUE or FALSE for those two formats and NULL for any other $dbformat.
function at_entrystart($linestr, $dbformat)
{
    switch ($dbformat)
    {
        case \"GENBANK\":
            $prefix = substr($linestr, 0, 5);
            return ($prefix == \"LOCUS\");

        case \"SWISSPROT\":
            $prefix = substr($linestr, 0, 2);
            return ($prefix == \"ID\");
    }
}

// get_entryid() gets the primary accession number of the sequence entry which we are
// currently processing.  This uniquely identifies a sequence entry.
//   $flines   : array of entry lines, passed by reference; only consulted for SWISSPROT,
//               where the element under its internal array pointer is examined.
//   $linestr  : the current line; for GENBANK the id is cut from columns 12 to 27 of it.
//   $dbformat : GENBANK or SWISSPROT.
// Returns the id string, or NULL when the format is unknown or the examined SWISSPROT
// line is not an AC line.
// Pointer side effect (SWISSPROT): unchanged when an AC line was found, moved one
// element forward otherwise.
// each() was removed in PHP 8.0, so the element is read with current() and the pointer
// is advanced with next(), which is what each() did.
function get_entryid(&$flines, $linestr, $dbformat)
        {
        if ($dbformat == \"GENBANK\")
                return trim(substr($linestr, 12, 16));
        elseif ($dbformat == \"SWISSPROT\")
                {
                $linestr = current($flines);
                next($flines);
                // current() yields FALSE past the last element; never treat that as a line.
                if ($linestr !== FALSE && substr($linestr,0,2) == \"AC\")
                        {
                        $words = preg_split(\"/;/\", intrim(substr($linestr,5)));
                        // Step back so that the caller still sees the AC line.
                        prev($flines);
                        return $words[0];
                        }
                }
        }

// line2r() copies the lines belonging to a single sequence entry into an array.
//   $fpseq : open file pointer positioned on the first line of the entry.
// Lines are read in chunks of at most 100 bytes, up to and including the terminator
// line, which starts with two slashes.
// Returns the array of lines, or FALSE when the file ends before a terminator line is
// seen.  The former endless loop never noticed that fgets() returns FALSE at end of
// file, so a truncated file made it spin forever and the FALSE return was unreachable.
function line2r($fpseq)
        {
        $flines = array();
        while (($linestr = fgets($fpseq, 101)) !== FALSE)
                {
                $flines[] = $linestr;
                if (left($linestr,2) == \'//\') return $flines;
                }
        return FALSE;
        }

// isa_qualifier() reports whether $str is a feature qualifier line, i.e. whether the
// character returned by firstchar() is a slash.  Only GenBank sequence files have
// such lines.  Returns TRUE or FALSE.
function isa_qualifier($str)
{
    $leading = firstchar($str);
    return ($leading == \'/\');
}

// fseekline() gets the byte offset (from beginning of file) of a particular line and
// leaves the file pointer on the first byte of that line.
//   $fp     : open file pointer.
//   $lineno : zero-based line number.  Lines are counted in chunks of at most 100 bytes,
//             so a longer physical line counts as more than one line.
// Returns the byte offset, or NULL when the file has fewer lines than requested.
function fseekline($fp, $lineno)
        {
        $linectr = 0;
        // Line 0 starts at offset 0.  This used to be left undefined, so a request for
        // the first line returned an undefined variable.
        $byteoff = 0;
        fseek($fp, 0);
        while(!feof($fp))
                {
                // Read (and discard) line number $linectr; $byteoff still holds its start.
                $linestr = fgets($fp,101);
                if ($linectr == $lineno)
                        {
                        fseek($fp, $byteoff);
                        return $byteoff;
                        }
                $linectr++;
                $byteoff = ftell($fp);
                }
        }

// bsrch_tabfile() binary-searches a whitespace-separated table file (an *.IDX file, for
// example) for the row whose $col-th column equals $seqid.
//   $fp    : open file pointer of the table; rows must be sorted on column $col.
//   $col   : zero-based column number holding the search key.
//   $seqid : the key to look for.
// Returns the columns of the matching row with the zero-based line number of that row
// appended as the last element, or FALSE when no row matches.
// Terminates the script with CANT OPEN FILE when $fp is not a usable file pointer.
function bsrch_tabfile($fp, $col, $seqid)
        {
        // Validate the handle before touching it (the test used to come after the first
        // fseek() and fgets() calls).
        if ($fp == FALSE) die(\"CANT OPEN FILE\");

        // Count the lines with the same 100-byte chunk size that fseekline() uses to
        // number them, and do not count the failed read at end of file as a line.
        $lastline = 0;
        fseek($fp, 0);
        while(!feof($fp))
                {
                if (fgets($fp, 101) === FALSE) break;
                $lastline++;
                }
        rewind($fp);

        $floor = 0;
        $ceiling = $lastline - 1;

        while($floor <= $ceiling)
                {
                // Probe the middle (upper middle for an even count) of the open range.
                $searchspace = $ceiling - $floor + 1;
                $lineno = $floor + ((int) ($searchspace/2));

                fseekline($fp, $lineno);
                $linestr = fgets($fp,81);
                if ($linestr === FALSE) return FALSE;

                $word = preg_split(\"/\\s+/\", trim($linestr));
                // A row with too few columns compares like NULL instead of raising a notice.
                $key = isset($word[$col]) ? $word[$col] : NULL;

                // NOTE(review): == and > compare numeric-looking strings as numbers.  Left
                // unchanged because the sort order of the table is not visible from here.
                if ($key == $seqid)
                        {
                        $word[] = $lineno;
                        return $word;
                        }
                elseif ($seqid > $key)
                        $floor = $lineno + 1;
                else
                        $ceiling = $lineno - 1;
                }
        return FALSE;
        }

// ================== CLASSES ========================

class SeqDB
{ // OPENS definition of SEQDB CLASS.
// Presumably the name of the database; not referenced by the methods visible here.
var $dbname;
// File name that fetch() opens and searches for a sequence id; an empty value makes
// fetch() treat the object as closed.
var $data_fn;
// NOTE(review): not referenced by the methods visible here; presumably a file pointer.
var $data_fp;
// File name that fetch() opens to look up the file that holds the sequence entry.
var $dir_fn;
// NOTE(review): not referenced by the methods visible here; presumably a file pointer.
var $dir_fp;
// Zero-based position of the current sequence; moved by first(), last(), prev(), next().
var $seqptr;
// Number of sequences; last() and next() treat seqcount-1 as the highest position.
var $seqcount;
// GENBANK or SWISSPROT; fetch() picks the parser from this value.
var $dbformat;
// Set to TRUE by prev() when the pointer is already on the first sequence.
var $bof;
// Set to TRUE by next() when the pointer is already on the last sequence.
var $eof;

// We need the functions bof() and eof() to determine if we\'ve reached the end of
// file or not.
// Two ways of doing this: 1) examine value of seqptr, or 2) maintain boolean variables eof and bof

// first() positions the sequence pointer (i.e. the seqptr property of this SeqDB object) on
// the first sequence in the database.
function first()
{
    // Position 0 is the first sequence, so rewinding is a plain reset.
    $this->seqptr = 0;
}

// last() positions the sequence pointer (i.e. the seqptr property of this SeqDB object) on
// the last sequence in the database.
function last()
{
    // Positions are zero-based, so the last one is the sequence count minus one.
    $lastpos = $this->seqcount - 1;
    $this->seqptr = $lastpos;
}

// prev() (short for previous) positions the sequence pointer (i.e. the seqptr property of
// this SeqDB object) on the sequence that comes before the current sequence.
function prev()
{
    // Already on the first sequence: stay there and raise the beginning-of-file flag.
    if (!($this->seqptr > 0))
    {
        $this->bof = TRUE;
        return;
    }

    $this->seqptr--;
}

// next() positions the sequence pointer (i.e. the seqptr property of this SeqDB object) on
// the sequence that comes after the current sequence.
function next()
{
    // Already on the last sequence: stay there and raise the end-of-file flag.
    if (!($this->seqptr < $this->seqcount - 1))
    {
        $this->eof = TRUE;
        return;
    }

    $this->seqptr++;
}
        
// fetch() retrieves all data from the specified sequence record and returns them in the 
// form of a Seq object.  This method invokes one of several parser methods.
// Optional argument: a sequence id.  With an id, the index file is searched for it and
// seqptr is moved to the row found; without one, the row at seqptr is used.
// Returns the parsed Seq object, or FALSE when a file cannot be opened, the id or the
// data file is not found, or the entry cannot be read completely.
// Terminates the script when the object has no index file name (closed object).
function fetch()
        {
        if ($this->data_fn == \"\") die(\"Cannot invoke fetch() method from a closed object.\");

        // Asking func_get_arg() for an argument that was not passed is an error on
        // PHP 8, so test the argument count instead of suppressing the error.
        $seqid = (func_num_args() > 0) ? func_get_arg(0) : FALSE;

        // IDX and DIR files remain open for the duration of the FETCH() method.
        $fp = fopen($this->data_fn, \"r\");
        if ($fp === FALSE) return FALSE;
        $fpdir = fopen($this->dir_fn, \"r\");
        if ($fpdir === FALSE)
                {
                fclose($fp);
                return FALSE;
                }

        if ($seqid != FALSE)
                {
                $idx_r = bsrch_tabfile($fp, 0, $seqid);
                // The row number appended by bsrch_tabfile() is read from element 3.
                if ($idx_r !== FALSE) $this->seqptr = $idx_r[3];
                }
        else
                {
                // For now, SEQPTR determines CURRENT SEQUENCE ID.  Alternative is to track curr line.
                fseekline($fp, $this->seqptr);
                $idx_r = preg_split(\"/\\s+/\", trim(fgets($fp, 81)));
                }

        // Every failure below leaves $myseq at FALSE and falls through to the single
        // clean-up at the end, so no file handle is leaked.
        $myseq = FALSE;
        $fpseq = FALSE;
        if ($idx_r !== FALSE && isset($idx_r[1], $idx_r[2]))
                {
                // Column 1 of the index row is looked up in the DIR file, whose column 1
                // is then opened as the data file.
                $dir_r = bsrch_tabfile($fpdir, 0, $idx_r[1]);
                if ($dir_r !== FALSE && isset($dir_r[1])) $fpseq = fopen($dir_r[1], \"r\");
                }

        if ($fpseq !== FALSE)
                {
                // Column 2 of the index row is the line number at which the entry starts.
                fseekline($fpseq, $idx_r[2]);
                $flines = line2r($fpseq);
                fclose($fpseq);

                if ($flines !== FALSE)
                        {
                        // An unknown format yields an empty Seq object, as before.
                        $myseq = new seq();
                        if ($this->dbformat == \"GENBANK\")
                                $myseq = $this->parse_id($flines);
                        elseif ($this->dbformat == \"SWISSPROT\")
                                $myseq = $this->parse_swissprot($flines);
                        }
                }

        fclose($fp);
        fclose($fpdir);

        return $myseq;
        }

// parse_swissprot() parses a Swissprot data file and returns a Seq object containing parsed data.
function parse_swissprot($flines)
        { // OPENS parse_swissprot() function
        $accession = array();
        $date_r = array();
        $desc = \"\";
        $desc_lnctr = 0;
        $gename_r = array();
        $os_r = array();
        $os_linectr = 0;
        $os_str = \"\";
        $oc_linectr = 0;
        $oc_str = \"\";
        $ref_r = array();
        $ra_r = array();
        $ra_ctr = 0;
        $ra_str = \"\";
        $rl_ctr = 0;
        $rl_str = \"\";
        $db_r = array();
        $ft_r = array();
        $kw_str = \"\";
        $kw_r = array();
        
        while ( list($no, $linestr) = each($flines) )
                { // OPENS 1st (outermost) while ( list($no, $linestr) = each($flines) ) 
                  $linelabel = left($linestr, 2);
                  $linedata = trim(substr($linestr, 5));
                  $lineend = right($linedata, 1);
                  if (left($linestr, 2) == \"ID\")
                        { // OPENS if (left($linestr, 2) == \"ID\") 
                        $words = preg_split(\"/;/\", substr($linestr, 5));    
                        $endc = preg_split(\"/\\s/\", $words[0]);
                        $entry_name = $endc[0];
                        $namesrc = preg_split(\"/_/\", $entry_name);  
                        $protein_name = $namesrc[0];
                        $protein_source = $namesrc[1]; 
                        $data_class = $endc[1]; 
                        $moltype = $words[1];
                        $length = (int) substr($words[2], 0, strlen($words[2])-4);         
                        } // CLOSES if (left($linestr, 2) == \"ID\")
                if (left($linestr, 2) == \"AC\")
                        { // OPENS if (left($linestr, 2) == \"AC\") 
                        $accstr = $linedata;
                        $accstr = substr($accstr, 0, strlen($accstr)-1);
                        $accline = preg_split(\"/;/\", intrim($accstr));                    
                        $accession = array_merge($accession, $accline);
                        } // CLOSES if (left($linestr, 2) == \"AC\") 
                if (left($linestr, 2) == \"DT\")
                        { // OPENS if (left($linestr, 2) == \"DT\") 
                        // DT DD-MMM-YEAR (REL. XX, COMMENT)
                        $datestr = $linedata;
                        $datestr = substr($datestr, 0, strlen($datestr)-1); 
                        $words = preg_split(\"/\\(/\", $datestr);        
                        // ( \"DD-MMM-YEAR \", \"REL. XX, COMMENT\")
                        $firstcomma = strpos($words[1], \",\");
                        $comment = trim(substr($words[1], $firstcomma+1));
                  
                        // ( \"CREATED\" => (date, rel), \"LAST SEQUENCE UPDATE\" => (date, rel),
                        //   \"LAST ANNOTATION UPDATE\" => (date, rel), COMMENT1 => (date, rel),
                        //   \"COMMENT2\" => (date, rel), ... )
                  
                        if ($comment == \"CREATED\")
                                { // OPENS if ($comment == \"CREATED\") 
                                // this DT line is a DATE CREATED line.
                                $create_date = substr($words[0], 0, 11); 
                                $create_rel = substr($words[1], 5, ($firstcomma-5));
                                $date_r[$comment] = array($create_date, $create_rel);  
                                } // CLOSES if ($comment == \"CREATED\")
                        elseif ($comment == \"LAST SEQUENCE UPDATE\")
                                { // OPENS elseif ($comment == \"LAST SEQUENCE UPDATE\")
                                $sequpd_date = substr($words[0], 0, 11);
                                $sequpd_rel = substr($words[1], 5, ($firstcomma-5));
                                $date_r[$comment] = array($sequpd_date, $sequpd_rel);  
                                } // CLOSES elseif ($comment == \"LAST SEQUENCE UPDATE\")
                        elseif ($comment == \"LAST ANNOTATION UPDATE\")
                                { // OPENS elseif ($comment == \"LAST ANNOTATION UPDATE\")
                                $notupd_date = substr($words[0], 0, 11);
                                $notupd_rel = substr($words[1], 5, ($firstcomma-5));
                                $date_r[$comment] = array($notupd_date, $notupd_rel);  
                                } // CLOSES elseif ($comment == \"LAST ANNOTATION UPDATE\")
                        else
                                { // OPENS else part of if ($comment == \"CREATED\")  
                                // For now, we do not check vs. duplicate comments.
                                // We just overwrite the older comment with new one.
                                $other_comment = $comment; 
                                $other_date = substr($words[0], 0, 11);
                                $other_rel = substr($words[1], 5, ($firstcomma-5));
                                $date_r[$comment] = array($other_date, $other_rel);
                                } // CLOSES else part of if ($comment == \"CREATED\")
                        } // CLOSES if (left($linestr, 2) == \"DT\")    

                        if (left($linestr, 2) == \"DE\")
                                { // OPENS if (left($linestr, 2) == \"DE\") 
                                $desc_lnctr++;
                                $linestr = $linedata;
                                if ($desc_lnctr == 1) $desc .= $linestr;
                                else $desc .= \" \" . $linestr;
                                
                                // Checks if (FRAGMENT) or (FRAGMENTS) is found at the end
                                // of the DE line to determine if sequence is complete.                                  
                                if (right($linestr, 1) == \".\")
                                        { // OPENS if (right($linestr, 1) == \".\")
                                        if (         (strtoupper(right($linestr, 11)) == \"(FRAGMENT).\") or 
                                                (strtoupper(right($linestr, 12)) == \"(FRAGMENTS).\") )
                                                $is_fragment = TRUE;
                                        else $is_fragment = FALSE;
                                        } // CLOSE if (right($linestr, 1) == \".\")
                                } // CLOSES if (left($linestr, 2) == \"DE\")
                          
                        if ($linelabel == \"KW\")
                                {
                                $kw_str .= $linedata;                                  
                                if ($lineend == \".\")
                                        {                                             
                                        $kw_str = rem_right($kw_str);
                                        $kw_r = preg_split(\"/;/\", $kw_str);
                                        array_walk($kw_r, \"trim_element\");                                          
                                        $kw_str = \"\";
                                        }                                  
                                }          
                          
                        if ($linelabel == \"OS\")                          
                                { // OPENS if ($linelabel == \"OS\") 
                                $os_linectr++;
                                if ($lineend != \".\")
                                        { // we are not yet at the last OS line.
                                        if ($os_linectr == 1) $os_str .= $linedata;
                                        else $os_str .= \" $linedata\";
                                        }
                                else 
                                        { // we are at the last OS line.
                                        $os_str .= \" $linedata\";
                                        $os_str = rem_right($os_str);
                                        $os_line = preg_split(\"/\\, AND /\", $os_str);
                                        }
                                } // CLOSES if ($linelabel == \"OS\")

                        if ($linelabel == \"OG\")
                                $organelle = rem_right($linedata);
                                
                        if ($linelabel == \"OC\")
                                {
                                $oc_linectr++;
                                if ($lineend != \".\")
                                        { // we are not yet at the last OS line.
                                        if ($oc_linectr == 1) $oc_str .= $linedata;
                                        else $oc_str .= \" $linedata\";
                                        }
                                else 
                                        { // we are at the last OS line.
                                        $oc_str .= \" $linedata\";
                                        $oc_str = rem_right($oc_str);
                                        $oc_line = preg_split(\"/;/\", $oc_str);
                                        array_walk($oc_line, \"trim_element\");
                                        }
                                }                  
                                
                        if ($linelabel == \"FT\")
                                {
                                $ft_key = trim(substr($linestr, 5, 8));                                  
                                $ft_from = (int) trim(substr($linestr, 14, 6));
                                $ft_to = (int) trim(substr($linestr, 21, 6));
                                $ft_desc = rem_right(trim(substr($linestr, 34)));
                                $ft_r[] = array($ft_key, $ft_from, $ft_to, $ft_desc);
                                }  
                          
                  // ( rn => ( \"rp\" => \"my rp\", \"rc\" => (\"tok1\" => \"value\", ...) ) )
                  // ( 10 => ( \"RP\" => \"my rp\", \"RC\" => (\"PLASMID\" => \"PLA_VAL\", ... ) ) )
                  // Example: DR AARHUS/GHENT-2DPAGE; 8006; IEF.
                  
                        if ($linelabel == \"DR\")
                                {
                                // DR DATA_BANK_IDENTIFIER; PRIMARY_IDENTIFIER; SECONDARY_IDENTIFIER
                                // We assume that all three data items are mandatory/present in all DR entries.
                                // ( refno => ( (dbname1, pid1, sid1), (dbname2, pid2, sid2), ... ), 1 => ( ... ) )
                                // ( 0 => ( (REBASE, pid1, sid1), (WORPEP, pid2, sid2), ... ), 1 => ( ... ) )
                                $linedata = rem_right($linedata);
                                $dr_line = preg_split(\"/;/\", $linedata); 
                                array_walk($dr_line, \"trim_element\");
                                $db_name = $dr_line[0];
                                $db_pid = $dr_line[1];
                                $db_sid = $dr_line[2];
                                $db_r[] = array($db_name, $db_pid, $db_sid);                   
                                }                          
                  
                        if ($linelabel == \"RN\")
                                { // OPENS \"RN\"
                                // Remove the [ and ] between the reference number.
                                $refno = substr(rem_right($linedata), 1);
                                
                                $rc_ctr = 0;
                                $rc_str = \"\";
                                $rc_flag = FALSE;
                                $inner_r = array();
                                while ( list($no, $linestr) = each($flines) )
                                        { // OPENS 2nd WHILE
                                        $linelabel = left($linestr, 2);
                                        $linedata = trim(substr($linestr, 5));
                                        $lineend = right($linedata, 1);
                                        if ($linelabel == \"RP\") $inner_r[\"RP\"] = $linedata;
                                        elseif ($linelabel == \"RC\")
                                                { // OPENS elseif ($linelabel == \"RC\")
                                                $rc_str .= $linedata;
                                                while ( list($no, $linestr) = each($flines) )
                                                        { // OPENS 3rd WHILE
                                                        $linelabel = left($linestr, 2);
                                                        $linedata = trim(substr($linestr, 5));
                                                        $lineend = right($linedata, 1);
                                                        if ($linelabel == \"RC\")
                                                                $rc_str .= \" $linedata\";                       
                                                        else
                                                                { // opens else
                                                                prev($flines);
                                                                break;
                                                                } // closes else
                                                        } // CLOSES 3rd WHILE

                                                // we remove the last character if it is \";\"
                                                $rc_str = trim($rc_str);
                                                if (right($rc_str,1) == \";\") $rc_str = rem_right($rc_str);
                                                $rc_line = preg_split(\"/;/\", trim($rc_str));
                                                array_walk($rc_line, \"trim_element\");
                                                $innermost = array();
                                                
                                                foreach($rc_line as $tokval_str)
                                                        {
                                                        // here we assume that there is no whitespace
                                                        // before or after (left or right of) the \"=\".
                                                        $tokval_r = preg_split(\"/=/\", $tokval_str);
                                                        $token = $tokval_r[0];
                                                        $value = $tokval_r[1];
                                                        $innermost[$token] = $value;
                                                        }
                                                $inner_r[\"RC\"] = $innermost; 
                                                } // CLOSES elseif ($linelabel == \"RC\")
                                        elseif ($linelabel == \"RM\")
                                                { // We have no idea what RM is about, so we assume it\'s a single-line entry.
                                                // which may occur 0 to 1 times inside a SWISSPROT SEQUENCE RECORD.
                                                $inner_r[\"RM\"] = $linedata;
                                                }
                                        elseif ($linelabel == \"RX\")
                                                {
                                                $linedata = rem_right($linedata);
                                                $rx_line = preg_split(\"/;/\", intrim($linedata));
                                                $inner_r[\"RX_BDN\"] = $rx_line[0];
                                                $inner_r[\"RX_ID\"] = $rx_line[1];
                                                }
                                        elseif ($linelabel == \"RA\")
                                                {
                                                $ra_ctr++;
                                                if ($ra_ctr == 1) $ra_str = $linedata;
                                                else $ra_str .= \" $linedata\";
                                                if ($lineend == \";\")
                                                        {
                                                        $ra_str = rem_right($ra_str);
                                                        $ra_r = preg_split(\"/\\,/\", $ra_str);
                                                        array_walk($ra_r, \"trim_element\");
                                                        $inner_r[\"RA\"] = $ra_r;                   
                                                        }
                                                }
                                        elseif ($linelabel == \"RL\")
                                                {
                                                $rl_ctr++;
                                                if ($rl_ctr == 1) $rl_str = $linedata;
                                                else $rl_str .= \" $linedata\";
                                                }
                                        else
                                                {
                                                $inner_r[\"RL\"] = $rl_str;
                                                prev($flines);
                                                break;
                                                }
                                        } // CLOSES 2nd WHILE

                                $ref_r[$refno-1] = $inner_r;
                                $ra_str = \"\";
                                $ra_ctr = 0;
                                $rl_str = \"\";
                                $rl_ctr = 0;
                                } // CLOSES \"RN\"

                        if (left($linestr, 2) == \"GN\")
                                { // OPENS if (left($linestr, 2) == \"GN\") 
                                // GN is always exactly one line.
                                // GNAME1 OR GNAME2               ( (GNAME1, GNAME2) )
                                // GNAME1 AND GNAME2              ( (GNAME1), (GNAME2) )
                                // GNAME1 AND (GNAME2 OR GNAME3)  ( (GNAME1), (GNAME2, GNAME3) )
                                // GNAME1 OR (GNAME2 AND GNAME3)  NOT POSSIBLE!!!
                                
                                /* ALGORITHM:
                                  1) Split expressions by \" AND \".
                                  2) Test each \"token\" if in between parentheses or not.
                                  3) If not, then token is a singleton, else it\'s a multiple-ton.
                                  4) Singletons are translated into (GNAME1).
                                          Multiple-tons are translated into (GNAME1, GNAME 2).
                                  5) Push gene name array into larger array. Go to next token.
                                */

                                // Remove \"GN \" at the beginning of our line.
                                $linestr = trim(substr($linestr, 5));
                                // Remove the last character which is always a period.
                                $linestr = substr($linestr, 0, strlen($linestr)-1); 

                                // Go here if you detect at least one ( or ). 
                                if ( is_false(strpos($linestr, \"(\")) )
                                        { // GN Line does not contain any parentheses.
                 // Ergo, it is made up of all OR\'s or AND\'s but not both.
                                        if (strpos($linestr, \" OR \") != FALSE)
                                                {
                                                // Case 1: GNAME1 OR GNAME2.
                                                $temp = preg_split(\"/ OR /\", $linestr);
                                                $gename_r[] = $temp;
                                                }
                                        elseif (strpos($linestr, \" AND \") != FALSE)
                                                {
                                                // Case 2: GNAME1 AND GNAME2 AND GNAME3.
                                                $temp = preg_split(\"/ AND /\", $linestr);
                                                foreach($temp as $gene)
                                                        $gename_r[] = array($gene);
                                                }
                                        else $gename_r[] = array($linestr);
                                        // Case 0: GN GENENAME1. One gene name (no OR, AND).
                                        }
                                else
                                        { // OPENS else part of if ( is_false(strpos($linestr, \"(\")) )  
                 // GN Line contains at least one pair of parentheses.
                 // Case 3: GNAME1 AND (GNAME2 OR GNAME3) => ( (GNAME1), (GNAME2, GNAME3) )
                 // COMMENTS # 1 below.
                                        $temp = preg_split(\"/ AND /\", $linestr);
                                        foreach($temp as $gene)
                                                { // OPENS foreach($temp as $gene) 
                                                if (substr($gene, 0, 1) == \"(\")
                                                        { // a list of 2 or more gene names OR\'ed together
                                                        // remove the \"(\" and \")\" at both ends of the string.
                                                        $gene = substr($gene, 1);
                                                        $gene = substr($gene, 0, strlen($gene)-1);
                                                        $genelist = preg_split(\"/ OR /\", $gene);
                                                        $gename_r[] = $genelist;
                                                        }
                                                else
                                                        { // singleton
                                                        $gename_r[] = array($gene);
                                                        }
                                                } // CLOSES foreach($temp as $gene)
                                        } // CLOSES else part of if ( is_false(strpos($linestr, \"(\")) )  
                                } // CLOSES if (left($linestr, 2) == \"GN\") 
                          
                                // 0123456789012345678901234567890123456789
                                // SQ   SEQUENCE XXXX AA; XXXXX MW; XXXXX CN;
                                if ($linelabel == \"SQ\")
                                        { // OPENS if ($linelabel == \"SQ\")
                                        $linedata = rem_right($linedata);
                                        // XXXX AA, XXXX MW, XXXX CN                                                                    
                                        $words = preg_split(\"/;/\", substr($linedata, 8));                                  
                                        $aa = preg_split(\"/\\s+/\", trim($words[0]));
                                        $aa_count = (int) trim($aa[0]);
                                        $mw = preg_split(\"/\\s+/\", trim($words[1]));
                                        $mol_wt = (int) trim($mw[0]);                                                                    
                                        $cn = preg_split(\"/\\s+/\", trim($words[2]));
                                        $chk_no = trim($cn[0]);
                                        $chk_method = trim($cn[1]); 
                                        $sequence = \"\";
                                        while ( list($no, $linestr) = each($flines) )
                                                { 
                                                $linelabel = left($linestr, 2);
                                                if ($linelabel == \"//\") break;
                                                $linedata = intrim(trim($linestr));
                                                $sequence .= $linedata;                                                                                             
                                                } 
                                        } // CLOSES if ($linelabel == \"SQ\")                          

        } // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) )                            
                
        $seqobj = new seq();                        
        $seqobj->id = $protein_name; 
        $seqobj->seqlength = $length;                                
        $seqobj->moltype = $moltype;
        $seqobj->date = $create_date;
        $seqobj->accession = $accession[0];
        array_shift($accession);
        $seqobj->sec_accession = $accession; 
        $seqobj->source = $os_line;
        $seqobj->organism = $oc_line;
        $seqobj->sequence = $sequence;                          
        $seqobj->definition = $desc;
        $seqobj->keywords = $kw_r;

        $genbank_ref_r = array();
        $inner_r = array();
        foreach($ref_r as $key => $value)
                {
                $inner_r[\"REFNO\"] = $key;                                                                
                $db_id = $value[\"RX_BDN\"];
                $inner_r[$db_id] = $value[\"RX_ID\"];                                                                
                $inner_r[\"REMARKS\"] = $value[\"RP\"];
                $inner_r[\"COMMENT\"] = $value[\"RC\"];                                                        
                $inner_r[\"TITLE\"] = $value[\"RL\"];
                $inner_r[\"JOURNAL\"] = $value[\"RL\"];          
                $inner_r[\"AUTHORS\"] = $value[\"RA\"];
                $genbank_ref_r[] = $inner_r;
                }
        $seqobj->reference = $genbank_ref_r;                                                        

        $swiss = array();
        $swiss[\"ID\"] = $protein_name;
        $swiss[\"PROT_NAME\"] = $protein_name;
        $swiss[\"MOL_TYPE\"] = $moltype;
        $swiss[\"PROT_SOURCE\"] = $protein_source;
        $swiss[\"DATA_CLASS\"] = $data_class;
        $swiss[\"LENGTH\"] = $length;                        
        $swiss[\"CREATE_DATE\"] = $create_date;  
        $swiss[\"CREATE_REL\"] = $create_rel; 
        $swiss[\"SEQUPD_DATE\"] = $sequpd_date;
        $swiss[\"SEQUPD_REL\"] = $sequpd_rel;
        $swiss[\"NOTUPD_DATE\"] = $notupd_date;
        $swiss[\"NOTUPD_REL\"] = $notupd_rel;
        // ACCESSION is an ARRAY.
        $swiss[\"ACCESSION\"] = $accession;
        $swiss[\"PRIM_AC\"] = $accession[0]; 
        $swiss[\"DESC\"] = $desc;
        $swiss[\"IS_FRAGMENT\"] = $is_fragment;
        // KEYWORDS is an ARRAY.
        $swiss[\"KEYWORDS\"] = $kw_r;
        // ORGANISM is an ARRAY.
        $swiss[\"ORGANISM\"] = $os_line;
        $swiss[\"ORGANELLE\"] = $organelle;
        // FT_<keyword> is an ARRAY.
        process_ft($swiss, $ft_r);                
        
        $swiss[\"AMINO_COUNT\"] = $aa_count;
        $swiss[\"MOLWT\"] = $mol_wt;
        $swiss[\"CHK_NO\"] = $chk_no;
        $swiss[\"CHK_METHOD\"] = $chk_method; 
        $swiss[\"SEQUENCE\"] = $sequence;        
        // GENE_NAME is an ARRAY.
        $swiss[\"GENE_NAME\"] = $gename_r;
        // ORG_CLASS is an ARRAY.                        
        $swiss[\"ORG_CLASS\"] = $oc_line;                        
        // REFERENCE is an ARRAY.
        $swiss[\"REFERENCE\"] = $ref_r;
        
        $seqobj->swissprot = $swiss;                // ARRAY          
        return $seqobj;                                                
        } // CLOSES parse_swissprot()

// parse_id() parses a GenBank data file and returns a Seq object containing parsed data.
//
// Parameter: $flines - array holding the lines of a GenBank flat file.  The array is walked
//            with each()/prev(), so parsing starts at its current internal pointer.
// Returns:   the seq object built from the first entry found.  Parsing stops at the first
//            line beginning with // (entry terminator).  The returned object also carries
//            a seqarray attribute, keyed by entry id, that holds the parsed entry.
//
// Fixes relative to the previous version:
//   * continuation lines of the ACCESSION field and the final seqarray entry were written
//     through $this (the SeqDB object) instead of the seq object being built;
//   * $feat_r is now initialised before it can be read;
//   * the ACCESSION continuation flag is cleared by any labelled line, not only by VERSION.
function parse_id($flines)
        {
        $seqarr = array();
        $inseq_flag = false;
        $seqdata_flag = false;
        $accession_flag = false;
        $ref_array = array();
        // Initialised here so the BASE COUNT test below never reads an undefined variable
        // when no FEATURES section was seen.
        $feat_r = array();

        while( list($lineno, $linestr) = each($flines) )
                { // OPENS outermost while( list($lineno, $linestr) = each($flines) )
      if (substr($linestr,0,5) == \"LOCUS\")
         {
         // This is the beginning of a SEQUENCE ENTRY.
         $ref_array = array();
         $seqdata = \"\";

         // The LOCUS line is fixed-width; every field is cut out by column position.
         $seqobj = new seq();
         $seqobj->id = trim(substr($linestr, 12, 16));
         $seqobj->seqlength = trim(substr($linestr, 29, 11)) * 1;

         $seqobj->moltype = substr($linestr, 47, 6);
         if (substr($linestr, 44, 3) == \"ss-\") $seqobj->strands = \"SINGLE\";
         elseif (substr($linestr, 44, 3) == \"ds-\") $seqobj->strands = \"DOUBLE\";
         elseif (substr($linestr, 44, 3) == \"ms-\") $seqobj->strands = \"MIXED\";

         $seqobj->topology = strtoupper(substr($linestr, 55, 8));
         $seqobj->division = strtoupper(substr($linestr, 64, 3));
         $seqobj->date = strtoupper(substr($linestr, 68, 11));

         $inseq_flag = true;
         }

      // The FEATURES block below rewinds onto the BASE COUNT line, so this test fires on
      // the next pass of the outer loop and attaches the collected features.
      if (trim(substr($linestr,0,10)) == \"BASE COUNT\")
         {
         if (count($feat_r) > 0)
            $seqobj->features = $feat_r;
         }

      if (trim(substr($linestr,0,12)) == \"FEATURES\")
         { // OPENS if (trim(substr($linestr,0,12)) == \"FEATURES\")
         // The REFERENCE section was present for this SEQUENCE ENTRY so we set REFERENCE attribute.
         if (count($ref_array) > 0) $seqobj->reference = $ref_array;
         $lastsubkey = \"\";

         $feat_r = array();
         $qual_r = array();

         // Go to the next line.
         list($lineno, $linestr) = each($flines);

         // This loops through each line in the entire FEATURES SECTION.
         // NOTE(review): the loop only ends when a line starting with BASE COUNT is read;
         // input without such a line appears never to leave it - confirm against the
         // flat-file variants this parser must support.
         while( substr($linestr,0,10) != \"BASE COUNT\" )
            { // FEATURES WHILE LOOP
            // Columns 0-20 hold the feature key (blank on qualifier lines), the rest is data.
            $label = trim(substr($linestr,0,21));
            $data = trim(substr($linestr,21));

            if (strlen($label) != 0)
               {
               // At the beginning of a new SUBKEY.
               $subkey = $label;
               // Add/save the qualifier array (qual_r) of the previous SUBKEY to our big feat_r array.
               if (count($qual_r) > 0)
                  {
                  $feat_r[$lastsubkey] = $qual_r;
                  $qual_r = array();
                  }

               // The text on the key line itself is stored under the key name.
               $qual = $subkey;
               $qual_r[$qual] = \"\";

               do
                  { // QUALIFIER WHILE LOOP
                  $qual_r[$qual] .= \" \" . $data;

                  list($lineno, $linestr) = each($flines);
                  $label = trim(substr($linestr,0,21));
                  $data = trim(substr($linestr,21));
                  } while( is_blank($label) and !(isa_qualifier($data)) );

               if (!(is_blank($label)))
                  {
                  $lastsubkey = $subkey;
                  $subkey = $label;
                  }
               }
            else
               { // we are inside a subkey section but on the 2nd, 3rd, nth line which have blank LABELS.
               if (isa_qualifier($data))
                  {
                  // Split the qualifier on = into its name and its value.
                  $wordarray = preg_split(\"/=/\", $data);
                  $qual = $wordarray[0];
                  $data = $wordarray[1];
                  $qual_r[$qual] = \"\";

                  do
                     { // QUALIFIER WHILE LOOP
                     $qual_r[$qual] .= \" \" . $data;
                     list($lineno, $linestr) = each($flines);
                     $label = trim(substr($linestr,0,21));
                     $data = trim(substr($linestr,21));
                     } while( is_blank($label) and !(isa_qualifier($data)) );

                  if (!(is_blank($label)))
                     {
                     $lastsubkey = $subkey;
                     $subkey = $label;
                     }
                  }
               } // ELSE PART of if (strlen($subkey) != 0)
            } // FEATURES WHILE LOOP

            // Save the qualifiers of the last SUBKEY in the section.
            if (count($qual_r) > 0)
               {
               $feat_r[$lastsubkey] = $qual_r;
               $qual_r = array();
               }

            // Step back so the outer loop sees the BASE COUNT line again.
            prev($flines);
         } // CLOSES if (trim(substr($linestr,0,12)) == \"FEATURES\")

      if (substr($linestr,0,10) == \"DEFINITION\")
         {
         // Drop the DEFINITION keyword and keep the rest of the line.
         $wordarray = explode(\" \", $linestr);
         array_shift($wordarray);
         $seqobj->definition = implode(\" \", $wordarray);
         }

      if ($inseq_flag == TRUE)
         { // OPENS if ($inseq_flag == TRUE)
         if (trim(substr($linestr, 0, 12)) == \"REFERENCE\")
            {
            // at this point, we are at the line with REFERENCE x (base y of z) in it.
            $wordarray = preg_split(\"/\\s+/\", trim(substr($linestr,12)));

            $ref_rec = array();
            $ref_rec[\"REFNO\"] = $wordarray[0];
            array_shift($wordarray);
            $ref_rec[\"BASERANGE\"] = implode(\" \", $wordarray);
            $lastsubkey = \"\";
            $subkey_lnctr = 0;

            while( list($lineno, $linestr) = each($flines) )
               {
               $subkey = trim(substr($linestr,0,12));

               // If current subkey is blank string, then this is a continuation of the last subsection.
               if (strlen($subkey) == 0) $subkey = $lastsubkey;

               // If we are at the next subkey section (e.g. lastsubkey was AUTHORS, and current is TITLE).
               if ($subkey != $lastsubkey) $subkey_lnctr = 0;

               switch ($subkey)
                  {
                  case \"AUTHORS\":
                     $subkey_lnctr++;
                     $wordarray = preg_split(\"/\\s+/\", trim(substr($linestr,12)));
                     // we remove comma at the end of a name, and the element \"and\".
                     $newarray = array();
                     foreach($wordarray as $authname)
                        {
                        if (strtoupper($authname) != \"AND\")
                           {
                           if (substr($authname, strlen($authname)-1, 1) == \",\")
                              $authname = substr($authname, 0, strlen($authname)-1);
                           $newarray[] = $authname;
                           }
                        }
                     if ($subkey_lnctr == 1) $ref_rec[\"AUTHORS\"] = $newarray;
                     else $ref_rec[\"AUTHORS\"] = array_merge($ref_rec[\"AUTHORS\"], $newarray);
                     break;
                  case \"TITLE\":
                     $subkey_lnctr++;
                     if ($subkey_lnctr == 1) $ref_rec[\"TITLE\"] = trim(substr($linestr,12));
                     else $ref_rec[\"TITLE\"] .= \" \" . trim(substr($linestr,12));
                     break;
                  case \"JOURNAL\":
                     $subkey_lnctr++;
                     if ($subkey_lnctr == 1) $ref_rec[\"JOURNAL\"] = trim(substr($linestr,12));
                     else $ref_rec[\"JOURNAL\"] .= \" \" . trim(substr($linestr,12));
                     break;
                  case \"MEDLINE\":
                     $ref_rec[\"MEDLINE\"] = substr($linestr, 12, 8);
                     break;
                  case \"PUBMED\":
                     $ref_rec[\"PUBMED\"] = substr($linestr, 12, 8);
                     break;
                  case \"REMARK\":
                     $subkey_lnctr++;
                     if ($subkey_lnctr == 1) $ref_rec[\"REMARK\"] = trim(substr($linestr,12));
                     else $ref_rec[\"REMARK\"] .= \" \" . trim(substr($linestr,12));
                     break;
                  case \"COMMENT\":
                     $subkey_lnctr++;
                     if ($subkey_lnctr == 1) $ref_rec[\"COMMENT\"] = trim(substr($linestr,12));
                     else $ref_rec[\"COMMENT\"] .= \" \" . trim(substr($linestr,12));
                     break;
                  }
               // FEATURES or the next REFERENCE ends this reference record; rewind so the
               // outer loop processes that line itself.
               if ($subkey == \"FEATURES\")
                  {
                  prev($flines);
                  break;
                  }
               if ($subkey == \"REFERENCE\")
                  {
                  prev($flines);
                  break;
                  }
               $lastsubkey = $subkey;
               }
            array_push($ref_array, $ref_rec);
            }
         if (trim(substr($linestr, 0, 12)) == \"SOURCE\")
            {
            // For now, assume a single-line SOURCE field.
            $seqobj->source = substr($linestr, 12);
            }
         if (trim(substr($linestr, 0, 12)) == \"SEGMENT\")
            {
            // The 1st and 3rd tokens after the keyword are the segment number and count.
            $seqobj->segment = substr($linestr, 12);
            $wordarray = preg_split(\"/\\s+/\", trim(substr($linestr,12)));
            $seqobj->segment_no = $wordarray[0];
            $seqobj->segment_count = $wordarray[2];
            }

         // For now, assume that KEYWORDS field consists of exactly one line.
         if (trim(substr($linestr, 0, 12)) == \"KEYWORDS\")
            {
            $wordarray = preg_split(\"/\\s+/\", trim($linestr));
            array_shift($wordarray);
            $wordarray = preg_split(\"/;+/\", implode(\" \", $wordarray));
            // A lone period means the entry has no keywords.
            if ($wordarray[0] != \".\") $seqobj->keywords = $wordarray;
            }
         if (substr($linestr, 0, 7) == \"VERSION\")
            {
            // Assume that VERSION line is made up of exactly 2 or 3 tokens.
            $wordarray = preg_split(\"/\\s+/\", trim($linestr));
            $seqobj->version = $wordarray[1];
            if (count($wordarray) == 3) $seqobj->ncbi_gi_id = $wordarray[2];
            $accession_flag = false;
            }
         if ($accession_flag == TRUE)
            {
            if (strlen(trim(substr($linestr, 0, 12))) == 0)
               {
               // 2nd, 3rd, etc. line of ACCESSION field (blank label columns).
               $wordarray = preg_split(\"/\\s+/\", trim($linestr));
               $seqobj->sec_accession = array_merge($seqobj->sec_accession, $wordarray);
               }
            else
               {
               // Any labelled line ends the ACCESSION field, so stop merging.
               $accession_flag = false;
               }
            }
         if (substr($linestr,0,9) == \"ACCESSION\")
            {
            // First token after the keyword is the primary accession, the rest are secondary.
            $wordarray = preg_split(\"/\\s+/\", trim($linestr));
            $seqobj->accession = $wordarray[1];
            array_shift($wordarray);
            array_shift($wordarray);
            $seqobj->sec_accession = $wordarray;
            $accession_flag = true;
            }
         if (substr($linestr,0,10) == \"  ORGANISM\")
            {
            $seqobj->organism = substr($linestr,12);
            }
         if (($seqdata_flag == true) && (substr($linestr,0,2) != \"//\"))
            {
            // Sequence line: drop the leading position number and glue the blocks together.
            $wordarray = explode(\" \", trim($linestr));
            array_shift($wordarray);
            $seqline = implode(\"\", $wordarray);
            $seqdata .= $seqline;
            }
         // Set after the test above so the ORIGIN line itself is not taken as sequence data.
         if (substr($linestr,0,6) == \"ORIGIN\") $seqdata_flag = true;
         if (substr($linestr,0,2) == \"//\")
            {
            // End of entry: store the sequence and register the entry under its id.
            // Under PHP 5+ object-handle semantics this slot refers to the returned object itself.
            $seqobj->sequence = $seqdata;
            $seqarr[$seqobj->id] = $seqobj;
            $seqdata_flag = false;
            $inseq_flag = false;
            break;
            }
         } // CLOSES if ($inseq_flag == TRUE)
      } // CLOSES outermost while( list($lineno, $linestr) = each($flines) )
        $seqobj->seqarray = $seqarr;
        return $seqobj;
        } // Closes parse_id() function definition

// open() attaches this SeqDB to the index files of database $dbname and rewinds the
// sequence pointer.  Counterpart of close().  The script is terminated with die() when
// either index file is missing.
function open($dbname)
        {
        $idx_path = $dbname . \".idx\";
        $dir_path = $dbname . \".dir\";

        // Both index files must already exist on disk.
        if (!file_exists($idx_path)) die(\"ERROR: Index file $dbname.IDX does not exist!\");
        if (!file_exists($dir_path)) die(\"ERROR: Index file $dbname.DIR does not exist!\");

        // Remember where the database lives and start at the first sequence.
        $this->seqptr = 0;
        $this->dir_fn = $dir_path;
        $this->data_fn = $idx_path;
        $this->dbname = $dbname;
        }

// close() detaches this SeqDB from its database once we are done with it.  Counterpart of open().
function close()
        {
        // Nothing is closed on disk: the attributes that open() fills in are simply reset
        // (names to empty strings, sequence pointer to -1).  Per the original author,
        // methods such as fetch are not expected to work while these values are blank.
        foreach(array(\"dbname\", \"data_fn\", \"dir_fn\") as $attr)
                {
                $this->$attr = \"\";
                }
        $this->seqptr = -1;
        }

/*
SeqDB() is the constructor method for the SeqDB class.  It creates and/or reads the index
files of a database and initializes certain SeqDB properties.
Syntax: $seqdb = new seqdb($dbname, $dbformat, $file1, $file2, ...);
        $dbformat defaults to GENBANK when omitted or empty.
Behavior: if $dbname exists and user gave no specific values for $file1, $file2, ...
             then seqdb() object USES/OPENS existing database (index files).
          if $dbname exists and user gave specific values for $file1, $file2, ...
             then seqdb() object OVERWRITES existing database (index files).
          if $dbname does not exist, then seqdb() object CREATES new database.
             even if $file1, $file2, ... are not specified.
A database is considered to exist when both $dbname.idx and $dbname.dir are present.
We provide the create() method to explicitly create a new database.
We provide the use() or open() method to explicitly use an existing database.

Index file layout, as written below:
   $dbname.dir - one line per data file:      <file number> <file name>
   $dbname.idx - one line per sequence entry: <entry id> <file number> <line number>
                 sorted by entry id.
*/
function SeqDB()
        {
        // Get all the arguments passed to this function.
        $args = func_get_args();
        $dbname = $args[0];
        $dbformat = isset($args[1]) ? strtoupper($args[1]) : \"\";
        if (strlen($dbformat) == 0) $dbformat = \"GENBANK\";
        $this->dbformat = $dbformat;

        // Everything after the first two arguments is the name of a data file.
        $datafile = array_slice($args, 2);

        /* db exists   fileX args   ACTION
                        Y            Y        create (overwrite)
                        Y            N        use
                        N            Y        create
                        N            N        create
        */
        $db_exists = (file_exists($dbname . \".idx\") and file_exists($dbname . \".dir\"));
        if ($db_exists and (count($datafile) == 0))
                {
                // For now, assume USING/OPENING a database is to be done in READ ONLY MODE.
                // The index files are left untouched; no file handle is opened here.
                $this->open($dbname);
                return;
                }

        // Create (or truncate) both index files first, so that open() below does not
        // die with the error message about a missing index file.
        $fp = fopen($dbname . \".idx\", \"w+\");
        $fpdir = fopen($dbname . \".dir\", \"w+\");
        if (($fp === FALSE) or ($fpdir === FALSE))
                {
                if ($fp !== FALSE) fclose($fp);
                if ($fpdir !== FALSE) fclose($fpdir);
                die(\"ERROR: Cannot create index files for $dbname!\");
                }

        // Points the object at the (still blank) index files, and sets seqptr to 0, etc.
        $this->open($dbname);

        // With no data files given, the database simply stays empty.
        if (count($datafile) > 0)
                {
                // temp_r = (\"AB1234\" => (\"AB1234\", 1, 12), \"BC4321\" => ... );
                $temp_r = array();
                foreach($datafile as $fileno=>$filename)
                        {
                        // Build our *.DIR file
                        fputs($fpdir, \"$fileno $filename\\n\");

                        // Automatically create an index containing info across all data files.
                        $flines = file($filename);
                        // Unreadable data file: it stays listed in *.DIR but adds no entries.
                        if ($flines === FALSE) continue;

                        // each() is kept on purpose: $flines is also handed to get_entryid().
                        // NOTE(review): presumably get_entryid() relies on the array pointer - confirm.
                        while( list($lineno, $linestr) = each($flines) )
                                {
                                if (at_entrystart($linestr, $dbformat))
                                        {
                                        $current_id = get_entryid($flines, $linestr, $dbformat);
                                        // Put entries in an array first, sort them, then write to *.IDX file.
                                        $temp_r[$current_id] = array($current_id, $fileno, $lineno);
                                        }
                                }
                        }
                // Sort our array by its keys, once, after all files have been scanned.
                ksort($temp_r);

                // Build our *.IDX file.
                $this->seqcount = count($temp_r);
                foreach($temp_r as $line_r)
                        {
                        fputs($fp, $line_r[0] . \" \" . $line_r[1] . \" \" . $line_r[2] . \"\\n\");
                        }
                }

        // Both handles are released on every path that opened them.
        fclose($fp);
        fclose($fpdir);
        } // CLOSES definition of SeqDB constructor function.
        
} // CLOSES definition of SEQDB CLASS.
?>