Exemplo n.º 1
0
 /// <summary>
 /// Creates a text parser on top of the tokenizer <paramref name="t"/>, starting
 /// with an empty vocabulary.
 /// </summary>
 /// <param name="t">Tokenizer handed to the base parser.</param>
 /// <param name="seq_container">
 /// List that receives the integer sequence; when null, a new in-memory
 /// <c>List&lt;int&gt;</c> is used instead.
 /// </param>
 public TextParser(BasicTokenizer t, IList<int> seq_container = null)
     : base(t)
 {
     Voc = new Dictionary<string, int>();
     // Fall back to a plain list when the caller does not supply a container.
     Seq = seq_container ?? new List<int>();
 }
Exemplo n.º 2
0
        /// <summary>
        /// Builds the table from the delimited text file at <paramref name="path"/>.
        /// Tokens produced by <paramref name="tokenizer"/> are distributed over
        /// <paramref name="_num_cols"/> column builders, and each builder is then
        /// finished into a <c>Column</c> using <paramref name="seq_builder"/>.
        /// </summary>
        /// <param name="path">Input file to read.</param>
        /// <param name="_num_cols">Number of columns to create.</param>
        /// <param name="tokenizer">Tokenizer that splits the input; it is also stored in <c>InputTokenizer</c>.</param>
        /// <param name="seq_builder">Sequence builder passed to every column's <c>Finish</c> call.</param>
        /// <remarks>
        /// Nothing here bounds the column cursor between record separators, so a record
        /// with more than <paramref name="_num_cols"/> fields indexes past the builder array.
        /// </remarks>
        public void Build(string path, int _num_cols, BasicTokenizer tokenizer, SequenceBuilder seq_builder)
        {
            Console.WriteLine ("*** building Table: '{0}', with {1} columns", path, _num_cols);
            var C = new ColumnBuilder[ _num_cols ];
            var recsep = tokenizer.RecordSeparator.ToString();

            // Every column starts with a record separator.
            for (int i = 0; i < _num_cols; ++i) {
                C [i] = new ColumnBuilder ();
                C [i].Add(recsep);
            }
            // The reader is only needed while tokenizing; the using block closes the
            // underlying file even if parsing throws, instead of leaving it to the finalizer.
            using (var input = new StreamReader (File.OpenRead (path))) {
                int numcol = 0;
                int numrec = 0;
                foreach (var p in tokenizer.Parse(input, false)) {
                    if (p.DataType == TokenType.FieldSeparator) {
                        // Close the current cell and move on to the next column.
                        C[numcol].Add(recsep);
                        ++numcol;
                        continue;
                    }
                    if (p.DataType == TokenType.RecordSeparator) {
                        if (numrec % 10000 == 0) {
                            Console.WriteLine("-- record: {0}, date-time: {1}", numrec, DateTime.Now);
                        }
                        // Close the current cell and every remaining column of this
                        // record, so short records still occupy one cell per column.
                        while (numcol < _num_cols) {
                            C[numcol].Add(recsep);
                            ++numcol;
                        }
                        ++numrec;
                        numcol = 0;
                        continue;
                    }
                    // Any other token is stored as data of the current column.
                    C[numcol].Add(p.Data);
                }
            }
            this.InputTokenizer = tokenizer;
            this.Columns = new Column[_num_cols];
            for (int i = 0; i < _num_cols; ++i) {
                Console.WriteLine ("*** compressing column-{0} of '{1}'", i, path);
                // One final separator is appended before the column is finished.
                C[i].Add (recsep);
                this.Columns[i] = C[i].Finish(recsep, seq_builder);
            }
        }
Exemplo n.º 3
0
 /// <summary>
 /// Builds the index from the files named in <paramref name="list"/>. Each file's
 /// text is fed to a <c>TextParser</c>, preceded by the parser's file separator;
 /// the resulting vocabulary and sequence are then stored in <c>Voc</c> and <c>Seq</c>.
 /// </summary>
 /// <param name="list">File names to read, in the order they are indexed.</param>
 /// <param name="seq_builder">Builder applied to the parsed sequence and the vocabulary size.</param>
 /// <param name="seq_container">Container handed to the parser to hold the raw sequence.</param>
 public virtual void Build(IEnumerable<string> list, SequenceBuilder seq_builder, IList<int> seq_container)
 {
     this.FileNames = new List<string> ();
     // All three tokenizer arguments are '\0' here.
     this.InputTokenizer = new BasicTokenizer('\0', '\0', '\0');
     var textParser = new TextParser(this.InputTokenizer, seq_container);
     int processed = 0;
     foreach (var name in list) {
         this.FileNames.Add (name);
         // The file separator goes in front of every document's content.
         var separator = textParser.GetFileSeparator();
         textParser.AddPlainString(separator);
         var content = File.ReadAllText (name);
         textParser.Parse (content);
         // Progress report every 500 documents, starting with the first one.
         if (processed % 500 == 0) {
             Console.WriteLine ("== reviewing docid {0}, date-time: {1}", processed, DateTime.Now);
         }
         processed++;
     }
     this.Voc = MapVocSeq.SortingVoc (textParser.Voc, textParser.Seq);
     this.Seq = seq_builder (textParser.Seq, this.Voc.Count);
     // Remember the vocabulary rank of the file separator.
     var fileSeparator = textParser.GetFileSeparator ();
     this.sep_symbol = this.RankVoc (fileSeparator);
 }
Exemplo n.º 4
0
        /// <summary>
        /// Test driver. When "db.test" does not exist yet, builds it from the
        /// tab-separated input file <c>args[0]</c> using <c>args[1]</c> columns and
        /// saves it; afterwards loads "db.test" and prints its first three records.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: input file, <c>args[1]</c>: number of columns. Both are
        /// only required when "db.test" has to be built.
        /// </param>
        public static void Main(string[] args)
        {
            var outname = "db.test";
            if (!File.Exists(outname)) {
                // Validate the arguments up front so a missing or malformed value yields
                // a usage message rather than an IndexOutOfRangeException/FormatException.
                int numColumns;
                if (args == null || args.Length < 2 || !int.TryParse (args [1], out numColumns) || numColumns <= 0) {
                    Console.Error.WriteLine ("usage: <input-file> <num-columns>  (num-columns must be a positive integer)");
                    Environment.ExitCode = 1;
                    return;
                }
                BasicTokenizer tokenizer = new BasicTokenizer ('\t', '\n', (char)0x0);
                Table table = new Table ();
                table.Build (args [0], numColumns, tokenizer, SequenceBuilders.GetSeqXLB_DiffSet64 (16, 31));
                using (var Output = new BinaryWriter(File.Create(outname))) {
                    table.Save (Output);
                }
            }
            {
                // Reload from disk so the printed records come from the saved file.
                Table table = new Table();
                using (var Input = new BinaryReader(File.OpenRead(outname))) {
                    table.Load(Input);

                    for (int i = 0; i < 3; ++i) {
                        var s = table.GetTextRecord(new StringBuilder(), i).ToString();
                        Console.WriteLine("=== record {0}: {1}", i, s);
                    }
                }
            }
        }
Exemplo n.º 5
0
 /// <summary>
 /// Restores the table from <paramref name="Input"/>: first the tokenizer, then a
 /// 32-bit column count, then that many columns in order.
 /// </summary>
 /// <param name="Input">Binary reader to read the table from.</param>
 public void Load(BinaryReader Input)
 {
     var tokenizer = new BasicTokenizer();
     this.InputTokenizer = tokenizer;
     tokenizer.Load(Input);

     int columnCount = Input.ReadInt32();
     this.Columns = new Column[columnCount];
     int index = 0;
     while (index < columnCount) {
         // Store the column before loading it, then read its content.
         var column = new Column();
         this.Columns[index] = column;
         column.Load(Input);
         index++;
     }
 }
Exemplo n.º 6
0
 /// <summary>
 /// Creates a parser that stores <paramref name="t"/> as its input tokenizer.
 /// </summary>
 /// <param name="t">Tokenizer assigned to <c>InputTokenizer</c>.</param>
 public BasicParser(BasicTokenizer t)
 {
     InputTokenizer = t;
 }
Exemplo n.º 7
0
 /// <summary>
 /// Restores the index from the files that share the prefix <paramref name="basename"/>:
 /// the tokenizer from <c>basename</c> itself, the file names from <c>basename.names</c>,
 /// the sequence from <c>basename.seq</c> and the vocabulary from <c>basename.voc</c>.
 /// </summary>
 /// <param name="basename">Path of the main index file and prefix of its companion files.</param>
 public virtual void Load(string basename)
 {
     using (var tokenizerReader = new BinaryReader(File.OpenRead(basename))) {
         var tokenizer = new BasicTokenizer();
         this.InputTokenizer = tokenizer;
         tokenizer.Load(tokenizerReader);
     }

     // One file name per line.
     this.FileNames = File.ReadAllLines (basename + ".names");

     using (var seqReader = new BinaryReader(File.OpenRead(basename + ".seq"))) {
         this.Seq = GenericIO<Sequence>.Load (seqReader);
     }

     // Vocabulary layout: a 32-bit entry count followed by that many strings.
     using (var vocReader = new BinaryReader(File.OpenRead(basename + ".voc"))) {
         int entryCount = vocReader.ReadInt32 ();
         this.Voc = new string[entryCount];
         int index = 0;
         while (index < entryCount) {
             this.Voc [index] = vocReader.ReadString ();
             index++;
         }
     }

     // The separator symbol is the vocabulary rank of the tokenizer's record separator.
     var separator = this.InputTokenizer.RecordSeparator.ToString();
     this.sep_symbol = this.RankVoc (separator);
 }
Exemplo n.º 8
0
 /// <summary>
 /// Creates a query parser on top of <paramref name="tokenizer"/>, starting with an
 /// empty query list.
 /// </summary>
 /// <param name="tokenizer">Tokenizer handed to the base parser.</param>
 public QueryParser(BasicTokenizer tokenizer)
     : base(tokenizer)
 {
     Query = new List<string>();
 }