// Creates a parser bound to the given tokenizer. Parsed symbols are appended
// to seq_container; when the caller passes none, a fresh list is allocated.
public TextParser(BasicTokenizer t, IList<int> seq_container = null) : base(t)
{
    this.Voc = new Dictionary<string, int> ();
    // this.InvIndex = new List<IList<int>> ();
    this.Seq = seq_container ?? new List<int>();
}
/// <summary>
/// Builds the table from the file at <paramref name="path"/>, splitting each
/// record into <paramref name="_num_cols"/> columns and compressing every
/// column with <paramref name="seq_builder"/>. Records shorter than
/// <paramref name="_num_cols"/> are padded with the record separator.
/// </summary>
public void Build(string path, int _num_cols, BasicTokenizer tokenizer, SequenceBuilder seq_builder)
{
    Console.WriteLine ("*** building Table: '{0}', with {1} columns", path, _num_cols);
    var C = new ColumnBuilder[ _num_cols ];
    int numcol = 0;
    int numrec = 0;
    var recsep = tokenizer.RecordSeparator.ToString();
    for (int i = 0; i < _num_cols; ++i) {
        C [i] = new ColumnBuilder ();
        C [i].Add(recsep);
    }
    // FIX: the original never disposed this reader, leaking the file handle.
    using (var input = new StreamReader (File.OpenRead (path))) {
        foreach (var p in tokenizer.Parse(input, false)) {
            if (p.DataType == TokenType.FieldSeparator) {
                // end of a field: close the current column and advance.
                C[numcol].Add(recsep);
                ++numcol;
                continue;
            }
            if (p.DataType == TokenType.RecordSeparator) {
                if (numrec % 10000 == 0) {
                    Console.WriteLine("-- record: {0}, date-time: {1}", numrec, DateTime.Now);
                }
                // pad short records so all columns stay record-aligned.
                while (numcol < _num_cols) {
                    C[numcol].Add(recsep);
                    ++numcol;
                }
                ++numrec;
                numcol = 0;
                continue;
            }
            C[numcol].Add(p.Data);
        }
    }
    this.InputTokenizer = tokenizer;
    this.Columns = new Column[_num_cols];
    for (int i = 0; i < _num_cols; ++i) {
        Console.WriteLine ("*** compressing column-{0} of '{1}'", i, path);
        C[i].Add (recsep); // trailing separator terminates the column.
        this.Columns[i] = C[i].Finish(recsep, seq_builder);
    }
}
// Tokenizes every file in `list` into one shared symbol sequence (each
// document prefixed by a file-separator token), then derives the sorted
// vocabulary and the compressed sequence representation.
public virtual void Build(IEnumerable<string> list, SequenceBuilder seq_builder, IList<int> seq_container)
{
    this.FileNames = new List<string> ();
    this.InputTokenizer = new BasicTokenizer('\0', '\0', '\0');
    var parser = new TextParser(this.InputTokenizer, seq_container);
    int doc_count = 0;
    foreach (var name in list) {
        this.FileNames.Add (name);
        // mark the document boundary before parsing its contents.
        parser.AddPlainString(parser.GetFileSeparator());
        parser.Parse (File.ReadAllText (name));
        if (doc_count % 500 == 0) {
            Console.WriteLine ("== reviewing docid {0}, date-time: {1}", doc_count, DateTime.Now);
        }
        ++doc_count;
    }
    this.Voc = MapVocSeq.SortingVoc (parser.Voc, parser.Seq);
    this.Seq = seq_builder (parser.Seq, this.Voc.Count);
    this.sep_symbol = this.RankVoc (parser.GetFileSeparator ());
}
// Entry point: builds "db.test" from args[0] (input path) and args[1]
// (column count) unless it already exists, then reloads it and prints the
// first three records as a smoke test.
public static void Main(string[] args)
{
    const string outname = "db.test";
    if (!File.Exists(outname)) {
        var tokenizer = new BasicTokenizer ('\t', '\n', (char)0x0);
        //Tokenizer tokenizer = new Tokenizer ('/', '\n', (char)0x0);
        var table = new Table ();
        table.Build (args [0], int.Parse (args [1]), tokenizer, SequenceBuilders.GetSeqXLB_DiffSet64 (16, 31));
        using (var Output = new BinaryWriter(File.Create(outname))) {
            table.Save (Output);
        }
    }
    {
        var table = new Table();
        using (var Input = new BinaryReader(File.OpenRead(outname))) {
            table.Load(Input);
            for (int i = 0; i < 3; ++i) {
                var record = table.GetTextRecord(new StringBuilder(), i).ToString();
                Console.WriteLine("=== record {0}: {1}", i, record);
            }
        }
    }
}
// Deserializes the table: first the tokenizer, then the column count,
// then each column in order, all from the given reader.
public void Load(BinaryReader Input)
{
    this.InputTokenizer = new BasicTokenizer();
    this.InputTokenizer.Load(Input);
    var num_columns = Input.ReadInt32();
    this.Columns = new Column[num_columns];
    for (int col = 0; col < num_columns; ++col) {
        var column = new Column();
        column.Load(Input);
        this.Columns[col] = column;
    }
}
// Base parser constructor: records the tokenizer used to split the input.
public BasicParser(BasicTokenizer t)
{
    this.InputTokenizer = t;
}
// Restores a saved index from its companion files: the tokenizer from
// `basename`, document names from ".names", the compressed sequence from
// ".seq", and the vocabulary from ".voc"; finally resolves the separator
// symbol against the loaded vocabulary.
public virtual void Load(string basename)
{
    using (var reader = new BinaryReader(File.OpenRead(basename))) {
        this.InputTokenizer = new BasicTokenizer();
        this.InputTokenizer.Load(reader);
    }
    this.FileNames = File.ReadAllLines (basename + ".names");
    using (var reader = new BinaryReader(File.OpenRead(basename + ".seq"))) {
        this.Seq = GenericIO<Sequence>.Load (reader);
    }
    using (var reader = new BinaryReader(File.OpenRead(basename + ".voc"))) {
        int voc_size = reader.ReadInt32 ();
        this.Voc = new string[voc_size];
        for (int i = 0; i < voc_size; ++i) {
            this.Voc [i] = reader.ReadString ();
        }
    }
    this.sep_symbol = this.RankVoc (this.InputTokenizer.RecordSeparator.ToString());
}
// Query parser constructor: starts with an empty token list for the query.
public QueryParser(BasicTokenizer tokenizer) : base(tokenizer)
{
    this.Query = new List<string>();
}