/// <summary>
/// Reads <paramref name="input"/> line by line and queues one <c>Index</c> document per line
/// (extracted id, zero-based line number, starting offset of the line), submitting the queued
/// documents through <c>clientIndex</c> in bulk requests of up to 10,000 entries.
/// </summary>
/// <param name="input">Reader to consume; it is read until <c>ReadLine</c> returns null.</param>
/// <returns>A task that completes after the last bulk request has been awaited.</returns>
public async Task buildIndex(StreamReader input) {
    const int batchSize = 10000;
    const string idPrefix = "\"id\":\"";
    const string idSuffix = "\"";

    long pos = 0, lineNum = 0;
    long bytePos = 0, lastBytePos = 0;
    BulkRequest req = new BulkRequest();

    string line;
    while ((line = input.ReadLine()) != null) {
        // NOTE(review): despite their names these totals count UTF-16 chars (line.Length), not
        // encoded bytes, and exclude line terminators. They only feed the diagnostic output below.
        lastBytePos = bytePos;
        bytePos += line.Length;

        // Remove the literal wrapper around the id. String.Trim(char[]) treats its argument as a
        // SET of characters, so it would also strip leading/trailing 'i', 'd', ':' and '"' that
        // belong to the id itself (e.g. "david" would become "av").
        string id = Regex.Match(line, regPattern).Value;
        if (id.StartsWith(idPrefix, StringComparison.Ordinal)) {
            id = id.Substring(idPrefix.Length);
        }
        if (id.EndsWith(idSuffix, StringComparison.Ordinal)) {
            id = id.Substring(0, id.Length - idSuffix.Length);
        }

        var index = new Index { _id = id, line = lineNum, position = pos };
        req.Include(JsonConvert.SerializeObject(index));

        // Per-line diagnostic output.
        Console.WriteLine("ID: {0}\n line number: {1}\nbyte pos: {2}\nlast byte pos: {3}\nbyte difference: {4}\nline size: {5}", id, lineNum, bytePos, lastBytePos, bytePos - lastBytePos, line.Length);

        lineNum++;
        // +1 for the newline char.
        // NOTE(review): this assumes one byte per char and a single '\n' terminator; confirm
        // against whatever consumes `position` (multi-byte text or "\r\n" files would drift).
        pos += line.Length + 1;

        // Submit a full batch and start a fresh request.
        if (lineNum % batchSize == 0) {
            await clientIndex.Documents.BulkAsync(req);
            req = new BulkRequest();
        }
    }

    // Submit the trailing partial batch; without this the lines after the last full batch
    // would never be indexed.
    if (lineNum % batchSize != 0) {
        await clientIndex.Documents.BulkAsync(req);
    }
}
/// <summary>
/// Scans <paramref name="input"/> byte by byte, splitting on '\n', and queues one <c>Index</c>
/// document per line (extracted id, zero-based line number, byte offset of the line start),
/// submitting the queued documents through <c>clientIndex</c> in bulk requests of up to
/// 100,000 entries. Progress statistics are written to the console after every full batch.
/// </summary>
/// <param name="input">Stream to consume; it is read until <c>ReadByte</c> reports end of stream.</param>
/// <returns>A task that completes after the last bulk request has been awaited.</returns>
public async Task buildCorpusIndex(FileStream input) {
    const int batchSize = 100000;
    const string idPrefix = "\"id\":\"";
    const string idSuffix = "\"";

    long i = 0, line = 0;
    int bit = 0;
    BulkRequest req = new BulkRequest();
    StringBuilder sb = new StringBuilder();
    Stopwatch startWatch = new Stopwatch(), loopWatch = new Stopwatch();
    startWatch.Start();
    loopWatch.Start();

    while (bit > -1) {
        sb.Clear();
        long linePos = i;

        // Collect one line. Each byte is widened to a char as-is.
        // NOTE(review): multi-byte encoded text is therefore not decoded; confirm ids are ASCII.
        while ((bit = input.ReadByte()) != '\n' && bit > -1) {
            i++;
            sb.Append((char)bit);
        }

        // End of stream with nothing read means there is no further line (empty file, or the
        // file ended with '\n'); do not queue a phantom entry with an empty id.
        if (bit < 0 && sb.Length == 0) {
            break;
        }

        // +1 for the new line byte
        i++;

        // Remove the literal wrapper around the id. String.Trim(char[]) treats its argument as a
        // SET of characters, so it would also strip leading/trailing 'i', 'd', ':' and '"' that
        // belong to the id itself (e.g. "david" would become "av").
        string id = Regex.Match(sb.ToString(), regPattern).Value;
        if (id.StartsWith(idPrefix, StringComparison.Ordinal)) {
            id = id.Substring(idPrefix.Length);
        }
        if (id.EndsWith(idSuffix, StringComparison.Ordinal)) {
            id = id.Substring(0, id.Length - idSuffix.Length);
        }

        var index = new Index { _id = id, line = line, position = linePos };
        req.Include(JsonConvert.SerializeObject(index));
        line++;

        // Store and print stats every 100k lines
        if (line % batchSize == 0) {
            await clientIndex.Documents.BulkAsync(req);
            req = new BulkRequest();
            Console.Clear();
            Console.WriteLine("Stats\nProcessed: {0}", line);
            Console.WriteLine("Loop Time: {0}", loopWatch.Elapsed);
            Console.WriteLine("Total Time: {0}", startWatch.Elapsed);
            loopWatch.Restart();
        }
    }

    // Submit the trailing partial batch; without this the lines after the last full batch
    // would never be indexed.
    if (line % batchSize != 0) {
        await clientIndex.Documents.BulkAsync(req);
    }
}