Class Bio::FastaDefline
In: lib/bio/db/fasta.rb  (CVS)
Parent: Object

Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or ":"-separated IDs.

Specs are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

  rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  rub.entry_id       ==> 'gi|671595'
  rub.get('emb')     ==> 'CAA85678.1'
  rub.emb            ==> 'CAA85678.1'
  rub.gi             ==> '671595'
  rub.accession      ==> 'CAA85678'
  rub.accessions     ==> [ 'CAA85678' ]
  rub.acc_version    ==> 'CAA85678.1'
  rub.locus          ==> nil
  rub.list_ids       ==> [["gi", "671595"],
                          ["emb", "CAA85678.1", nil],
                          ["Perovskia abrotanoides"]]

  ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  ckr.entry_id      ==> "gi|2495000"
  ckr.sp            ==> "CCKR_CAVPO"
  ckr.pir           ==> "I51898"
  ckr.gb            ==> "AAB29504.1"
  ckr.gi            ==> "2495000"
  ckr.accession     ==> "AAB29504"
  ckr.accessions    ==> ["Q63931", "AAB29504"]
  ckr.acc_version   ==> "AAB29504.1"
  ckr.locus         ==> nil
  ckr.description   ==>
    "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  ckr.descriptions  ==>
    ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
     "cholecystokinin A receptor - guinea pig",
     "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  ckr.words         ==>
    ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
     "receptor", "type"]
  ckr.id_strings    ==>
    ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
     "544724", "AAB29504.1", "Cavia"]
  ckr.list_ids      ==>
    [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
     ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
     ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Methods

Constants

# Maps an NSID database tag to the ordered labels of the fields that follow
# the tag in a defline. The lookup methods use a label's position in the
# array as an offset into the parsed ID array.
NSIDs = {
  # NCBI and WU-BLAST
  'gi'  => [ 'gi' ],                             # NCBI GI
  'gb'  => [ 'acc_version', 'locus' ],           # GenBank
  'emb' => [ 'acc_version', 'locus' ],           # EMBL
  'dbj' => [ 'acc_version', 'locus' ],           # DDBJ
  'sp'  => [ 'accession', 'entry_id' ],          # SWISS-PROT
  'pdb' => [ 'entry_id', 'chain' ],              # PDB
  'bbs' => [ 'number' ],                         # GenInfo Backbone Id
  'gnl' => [ 'database' , 'entry_id' ],          # General database identifier
  'ref' => [ 'acc_version' , 'locus' ],          # NCBI Reference Sequence
  'lcl' => [ 'entry_id' ],                       # Local Sequence identifier

  # WU-BLAST and NCBI
  'pir' => [ 'accession', 'entry_id' ],          # PIR
  'prf' => [ 'accession', 'entry_id' ],          # Protein Research Foundation
  'pat' => [ 'country', 'number', 'serial' ],    # Patents

  # WU-BLAST only
  'bbm' => [ 'number' ],                         # NCBI GenInfo Backbone database identifier
  'gim' => [ 'number' ],                         # NCBI GenInfo Import identifier
  'gp'  => [ 'acc_version', 'locus' ],           # GenPept
  'oth' => [ 'accession', 'name', 'release' ],   # Other (user-definable) identifier
  'tpd' => [ 'accession', 'name' ],              # Third party annotation, DDBJ
  'tpe' => [ 'accession', 'name' ],              # Third party annotation, EMBL
  'tpg' => [ 'accession', 'name' ],              # Third party annotation, GenBank

  # Original
  'ri'  => [ 'entry_id', 'rearray_id', 'len' ],  # RIKEN FANTOM DB
}

# Common words that carry no identifying information.
KillWords = [
  'an', 'the', 'this', 'that',
  'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
  'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
  'from', 'and', 'or', 'not',
  'dna', 'rna', 'mrna', 'cdna', 'orf',
  'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
  'similar', 'involved', 'identical', 'identity',
  'cds', 'clone', 'library', 'contig', 'contigs',
  'homolog', 'homologue', 'homologs', 'homologous',
  'protein', 'proteins', 'gene', 'genes',
  'product', 'products', 'sequence', 'sequences',
  'strain', 'strains', 'region', 'regions',
]

# Default lookup table used by #words: a token is dropped when its
# downcased form is a key with a true value.
# NOTE(review): presumably filled from KillWords elsewhere in the class
# body -- confirm; it is empty as declared here.
KillWordsHash = {}

# Default patterns used by #words: a token matching any of them is dropped.
KillRegexpArray = [
  /\A\d{1,3}\%?\z/,                               # 1-3 digits, optional '%'
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,  # ID-like: capital, digits inside
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/               # ID-like: UPPER_CASE with '_'
]

Attributes

entry_id  [R]  Shows a possibly unique identifier. Returns a string.
list_ids  [R]  Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings.

Public Class methods

Parses given string.

[Source]

# File lib/bio/db/fasta.rb, line 469
    # Creates a new object from a raw defline string.
    #
    # +str+ may hold several deflines joined by "\x01" (Ctrl-A); each piece
    # is handed to #add_defline in order.
    def initialize(str)
      @entry_id = nil
      @list_ids = []
      @info = {}
      @deflines = []

      str.split("\x01").each { |piece| add_defline(piece) }
    end

Public Instance methods

Shows accession with version number. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.

[Source]

# File lib/bio/db/fasta.rb, line 782
    # Returns the accession-with-version string found by
    # get_by_type('acc_version'), or nil.
    # The lookup runs once; the result (even nil) is cached in @acc_version.
    def acc_version
      return @acc_version if defined?(@acc_version)

      @acc_version = get_by_type('acc_version')
    end

Shows an accession number.

[Source]

# File lib/bio/db/fasta.rb, line 800
    # Returns an accession number without its version suffix, or nil.
    #
    # When #acc_version yields a value, the text before its first '.' is
    # used; otherwise the first element of #accessions is used.
    # The result (even nil) is cached in @accession.
    def accession
      return @accession if defined?(@accession)

      versioned = acc_version
      @accession = versioned ? versioned.split('.')[0] : accessions[0]
    end

Shows accession numbers. Returns an array of strings.

[Source]

# File lib/bio/db/fasta.rb, line 791
    # Returns an array of accession strings.
    #
    # Values come from get_all_by_type('accession', 'acc_version'); anything
    # from the first '.' onward is stripped from each value.
    # The array is built once and cached in @accessions.
    def accessions
      return @accessions if defined?(@accessions)

      @accessions = get_all_by_type('accession', 'acc_version')
      @accessions.map! { |id| id.sub(/\..*\z/, '') }
      @accessions
    end

Parses given string and adds parsed data.

[Source]

# File lib/bio/db/fasta.rb, line 483
    # Parses one defline string and records the result on this object.
    #
    # The string is matched against three patterns in turn:
    # 1. '|'-separated NSIDs,
    # 2. a ':'-separated ID ("db:id"),
    # 3. a single bare ID followed by an optional description.
    # Anything else falls through to the final +else+ branch and is
    # stored as-is.
    #
    # Side effects visible here: one array is appended to @deflines, its
    # first element is the separator ('|', ':' or '') and its last element
    # is the description; @list_ids and @info['organism'] may be extended;
    # @entry_id is set only if it has not been set before.
    def add_defline(str)
      case str
      when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
        # NSIDs
        # examples:
        # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
        #
        # note: regexp (?:) means grouping without backreferences
        i = $1
        d = $2
        tks = i.split('|')
        # String#split drops a trailing empty field; restore it so that a
        # final '|' is still represented as an (empty) token.
        tks << '' if i[-1,1] == '|'
        # NOTE(review): parse_NSIDs is not shown here; the code below
        # presumably relies on it removing the tokens it recognises from
        # tks and returning an array of ID arrays -- confirm.
        a = parse_NSIDs(tks)
        # The first parsed ID group, re-joined, is the entry ID candidate.
        i = a[0].join('|')
        a.unshift('|')
        # Tokens still left in tks are prepended to the description.
        d = tks.join('|') + ' ' + d unless tks.empty?
        a << d
        this_line = a
        match_EC(d)
        # Bracketed text starting with a capital letter (and for which
        # match_EC returns a falsy value) is recorded as an ID-like
        # string; the first such text becomes @info['organism'].
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) and x =~ /\A[A-Z]/ then
            di = [  x ]
            @list_ids << di
            @info['organism'] = x unless @info['organism']
          end
        end

      when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
        # examples:
        # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
        # >emb:CACDC28 [X80034] C.albicans CDC28 gene
        i = $1
        d = $2
        a = parse_ColonSepID(i)
        i = a.join(':')
        this_line = [ ':', a , d ]
        match_EC(d)
        # Bracketed text containing ':' is passed to parse_ColonSepID;
        # bracketed text that looks like an upper-case ID is added to
        # @list_ids directly.
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) and x =~ /:/ then
            parse_ColonSepID(x)
          elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
            @list_ids << [ $1 ]
          end
        end

      when /^\>?\s*(\S+)(?:\s+(.+))?$/
        # examples:
        # >ABC12345 this is test
        i = $1
        # $2 is nil when there is no description; normalise to ''.
        d = $2.to_s
        # A single trailing '.' is removed from the copy kept in @list_ids.
        @list_ids << [ i.chomp('.') ]
        this_line = [  '', [ i ], d ]
        match_EC(d)
      else
        # No pattern matched: keep the whole string as the ID, with an
        # empty description.
        i = str
        d = ''
        match_EC(i)
        this_line = [ '', [ i ], d ]
      end

      @deflines << this_line
      # Only the first defline added determines the entry ID.
      @entry_id = i unless @entry_id
    end

Shows description.

[Source]

# File lib/bio/db/fasta.rb, line 625
    # Returns the last element of the first stored defline (its
    # description), or nil when no defline has been stored.
    def description
      first_line = @deflines.first
      first_line.to_a.last
    end

Returns descriptions.

[Source]

# File lib/bio/db/fasta.rb, line 630
    # Returns an array holding the last element (the description) of every
    # stored defline, in the order the deflines were added.
    def descriptions
      @deflines.map { |defline| defline[-1] }
    end

Returns an identifier by database name.

[Source]

# File lib/bio/db/fasta.rb, line 706
    # Returns the identifier stored for database +dbname+ (anything
    # responding to to_s), or a falsy value when nothing is known.
    #
    # Lookup order:
    # 1. a value already stored in @info under the name;
    # 2. the first entry of @list_ids whose tag equals the name:
    #    - entries with at most two elements yield their last element;
    #    - longer entries are searched by label, preferring
    #      'acc_version', 'entry_id', 'locus', 'accession', 'number' in
    #      that order, then falling back to the first non-nil field.
    # A truthy result is cached in @info.
    def get(dbname)
      db = dbname.to_s
      value = @info[db]
      return value if value

      entry = @list_ids.find { |ids| ids[0] == db }
      return value unless entry

      if entry.size <= 2
        value = entry[-1]
      else
        labels = self.class::NSIDs[db]
        preferred = [ 'acc_version', 'entry_id',
                      'locus', 'accession', 'number' ]
        preferred.each do |label|
          pos = labels.index(label)
          next unless pos
          value = entry[pos + 1]
          break if value
        end
        value ||= entry[1..-1].find { |field| field }
      end

      @info[db] = value if value
      value
    end

Returns identifiers by given type.

[Source]

# File lib/bio/db/fasta.rb, line 742
    # Collects every identifier whose field label is one of +type_strarg+.
    #
    # For each entry of @list_ids whose tag is a key of NSIDs, each
    # requested label is looked up in turn and the field at that position
    # is collected. Nil fields and unknown tags are skipped.
    # Returns an array (possibly empty).
    def get_all_by_type(*type_strarg)
      found = []
      @list_ids.each do |ids|
        labels = self.class::NSIDs[ids[0]]
        next unless labels

        type_strarg.each do |type|
          pos = labels.index(type)
          next unless pos
          field = ids[pos + 1]
          found << field if field
        end
      end
      found
    end

Returns an identifier by given type.

[Source]

# File lib/bio/db/fasta.rb, line 730
    # Returns the first identifier whose field label equals +type_str+,
    # or nil when no entry carries a value for that label.
    #
    # Entries of @list_ids are scanned in order. An entry is considered
    # only if its tag is a key of NSIDs and the label list for that tag
    # contains +type_str+. Entries whose field for that label is nil are
    # skipped, so a value on a later entry is still found; this matches
    # get_all_by_type, which also ignores nil fields.
    def get_by_type(type_str)
      @list_ids.each do |ids|
        labels = self.class::NSIDs[ids[0]]
        next unless labels

        pos = labels.index(type_str)
        next unless pos

        value = ids[pos + 1]
        # Keep scanning when this database declares the label but the
        # field itself is empty.
        return value if value
      end
      nil
    end

Shows GI. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.

[Source]

# File lib/bio/db/fasta.rb, line 771
    # Returns the GI found by get_by_type('gi'), or nil.
    # The lookup runs once; the result (even nil) is cached in @gi.
    def gi
      return @gi if defined?(@gi)

      @gi = get_by_type('gi')
    end

Shows ID-like strings. Returns an array of strings.

[Source]

# File lib/bio/db/fasta.rb, line 638
    # Returns an array of ID-like strings.
    #
    # Sources, in order:
    # 1. every non-nil field (everything after the tag) of each entry of
    #    @list_ids that has two or more elements;
    # 2. the sole element of a one-element entry, when it is non-empty and
    #    consists only of letters, digits, '.', '-' and '_';
    # 3. case-sensitive words from words(true, []) that look like
    #    identifiers (same two patterns as below).
    def id_strings
      collected = []
      @list_ids.each do |ids|
        if ids.size >= 2
          collected.concat(ids[1..-1].select { |field| field })
          next
        end

        head = ids[0]
        next if head.to_s.empty?
        collected << head if head =~ /\A[A-Za-z0-9\.\-\_]+\z/
      end

      id_like_words = words(true, []).select do |word|
        word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
          word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
      end
      collected.concat(id_like_words)
    end

Shows locus. If the entry has two or more such IDs, only the first ID is shown. Returns a string or nil.

[Source]

# File lib/bio/db/fasta.rb, line 760
    # Returns the locus found by get_by_type('locus'), or nil.
    # The lookup runs once; the result (even nil) is cached in @locus.
    def locus
      return @locus if defined?(@locus)

      @locus = get_by_type('locus')
    end

[Source]

# File lib/bio/db/fasta.rb, line 811
    # Treats an unknown method name as a database name and forwards it
    # (with any extra arguments) to #get, so that e.g. +obj.emb+ works.
    #
    # Returns whatever #get returns when that is truthy, or when the name
    # is a key of NSIDs (in which case the result may be nil).
    # Otherwise raises a RuntimeError whose message starts with
    # "NameError: undefined method".
    def method_missing(name, *args)
      found = get(name, *args)
      return found if found || self.class::NSIDs[name.to_s]

      raise "NameError: undefined method `#{name.inspect}'"
    end

Shows the original string. Note that the result of this method may differ from the original string given to FastaDefline.new.

[Source]

# File lib/bio/db/fasta.rb, line 617
    # Rebuilds a defline string from the parsed data.
    #
    # Each stored defline is [separator, id_array..., description]: the ID
    # arrays are joined with the separator, a space and the description
    # are appended, and surrounding whitespace is stripped. The deflines
    # are then joined with "\x01".
    def to_s
      rendered = @deflines.map do |defline|
        sep = defline[0]
        id_part = defline[1..-2].map { |ids| ids.join(sep) }.join(sep)
        (id_part + ' ' + defline[-1]).strip
      end
      rendered.join("\x01")
    end

Shows words used in the defline. Returns an Array.

[Source]

# File lib/bio/db/fasta.rb, line 680
    # Returns the sorted, de-duplicated words found in all descriptions.
    #
    # case_sensitive:: when falsy (default) every word is downcased.
    # kill_regexp::    patterns; a word matching any of them is dropped.
    # kwhash::         a word is dropped when kwhash[word.downcase] is
    #                  truthy.
    #
    # Descriptions are joined with a space and split on runs of
    # punctuation, whitespace and control characters. Leading '$', '*',
    # '-', '+' and trailing '$', '*', '-', '=' are trimmed from each token,
    # and tokens of one character or less are dropped.
    def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
              kwhash = self.class::KillWordsHash)
      delimiters = /[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/
      kept = []
      descriptions.join(' ').split(delimiters).each do |token|
        word = token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
        next if word.size <= 1
        next if kwhash[word.downcase]
        next if kill_regexp.any? { |expr| expr =~ word }
        kept << word
      end
      kept.map! { |word| word.downcase } unless case_sensitive
      kept.sort.uniq
    end

[Validate]