Esempio n. 1
0
		/// <summary>
		/// Reads <paramref name="input"/> line by line, builds one <c>Index</c> entry per line
		/// (extracted id, zero-based line number, starting offset) and uploads the entries with
		/// <c>clientIndex.Documents.BulkAsync</c> in batches of 10000.
		/// </summary>
		/// <remarks>
		/// The stored offset is accumulated from <c>line.Length + 1</c>, i.e. it counts characters
		/// plus one newline character per line.
		/// NOTE(review): for multi-byte content or "\r\n" line endings this is not the byte offset
		/// in the underlying file — confirm what the consumers of <c>position</c> expect.
		/// </remarks>
		/// <param name="input">Reader supplying the lines to index.</param>
		/// <returns>A task that completes once every batch, including the final partial one, has been sent.</returns>
		public async Task buildIndex(StreamReader input) {
			const int batchSize = 10000;
			const string idPrefix = "\"id\":\"";

			string line;
			string id;
			long pos = 0, lineNum = 0;
			long bytePos = 0, lastBytePos = 0;
			int pending = 0; // entries included in req but not yet sent

			BulkRequest req = new BulkRequest();

			while (!input.EndOfStream) {

				line = input.ReadLine();
				lastBytePos = bytePos;
				bytePos += line.Length;

				id = Regex.Match(line, regPattern).Value;
				// String.Trim(char[]) removes a *set* of characters from both ends, so it would
				// also eat a leading/trailing 'i', 'd', ':' or '"' that belongs to the id itself.
				// When the match carries the expected "id":" prefix, strip exactly that prefix and
				// the closing quote; otherwise keep the previous trimming behaviour.
				if (id.StartsWith(idPrefix, StringComparison.Ordinal)) {
					id = id.Substring(idPrefix.Length);
					if (id.EndsWith("\"", StringComparison.Ordinal)) {
						id = id.Substring(0, id.Length - 1);
					}
				} else {
					id = id.Trim(idPrefix.ToCharArray());
				}

				var index = new Index { _id = id, line = lineNum, position = pos };
				req.Include(JsonConvert.SerializeObject(index));
				pending++;

				Console.WriteLine("ID: {0}\n line number: {1}\nbyte pos: {2}\nlast byte pos: {3}\nbyte difference: {4}\nline size: {5}", 
									id, lineNum, bytePos, lastBytePos, bytePos - lastBytePos, line.Length);

				lineNum++;
				pos += line.Length + 1; //+1 for newline char

				// Flush only full batches; checking after the increment avoids sending a
				// one-document request for the very first line.
				if (pending >= batchSize) {
					await clientIndex.Documents.BulkAsync(req);
					req = new BulkRequest();
					pending = 0;
				}
			}

			// Send whatever is left over; without this the last partial batch is lost.
			if (pending > 0) {
				await clientIndex.Documents.BulkAsync(req);
			}
		}
Esempio n. 2
0
		/// <summary>
		/// Scans <paramref name="input"/> byte by byte, splitting on '\n', and builds one
		/// <c>Index</c> entry per line (extracted id, zero-based line number, offset of the line's
		/// first byte counted from where reading started). Entries are uploaded with
		/// <c>clientIndex.Documents.BulkAsync</c> in batches of 100000, and progress statistics are
		/// printed to the console after each full batch.
		/// </summary>
		/// <remarks>
		/// Each byte is appended as the character with the same code, so only the single-byte part
		/// of the line text is reproduced faithfully.
		/// NOTE(review): assumes the text matched by <c>regPattern</c> is ASCII — confirm.
		/// </remarks>
		/// <param name="input">Stream to index; it is read until <c>ReadByte</c> returns -1.</param>
		/// <returns>A task that completes once every batch, including the final partial one, has been sent.</returns>
		public async Task buildCorpusIndex(FileStream input) {
			const int batchSize = 100000;
			const string idPrefix = "\"id\":\"";

			long i = 0, line = 0, linePos = 0;
			int bit = 0;
			int pending = 0; // entries included in req but not yet sent
			string indexStr, id;
			BulkRequest req = new BulkRequest();
			StringBuilder sb = new StringBuilder(); // reused for every line

			Stopwatch startWatch = new Stopwatch(), loopWatch = new Stopwatch();
			startWatch.Start();
			loopWatch.Start();
			
			while (bit > -1) {
				sb.Clear();
				linePos = i;

				while ((bit = input.ReadByte()) != '\n' && bit > -1) {
					i++;
					sb.Append((char)bit);
				}

				// End of stream with nothing buffered: the data ended with '\n' (or was empty),
				// so there is no further line. Stop instead of indexing a phantom empty entry.
				if (bit < 0 && sb.Length == 0) {
					break;
				}

				//+1 for the newline byte
				i++;

				indexStr = sb.ToString();

				id = Regex.Match(indexStr, regPattern).Value;
				// String.Trim(char[]) removes a *set* of characters from both ends, so it would
				// also eat a leading/trailing 'i', 'd', ':' or '"' that belongs to the id itself.
				// When the match carries the expected "id":" prefix, strip exactly that prefix and
				// the closing quote; otherwise keep the previous trimming behaviour.
				if (id.StartsWith(idPrefix, StringComparison.Ordinal)) {
					id = id.Substring(idPrefix.Length);
					if (id.EndsWith("\"", StringComparison.Ordinal)) {
						id = id.Substring(0, id.Length - 1);
					}
				} else {
					id = id.Trim(idPrefix.ToCharArray());
				}

				var index = new Index { _id = id, line = line, position = linePos };

				req.Include(JsonConvert.SerializeObject(index));
				pending++;

				line++;

				//Store and print stats every 100k lines
				if (pending >= batchSize) {
					await clientIndex.Documents.BulkAsync(req);
					req = new BulkRequest();
					pending = 0;

					Console.Clear();
					Console.WriteLine("Stats\nProcessed: {0}", line);
					Console.WriteLine("Loop Time: {0}", loopWatch.Elapsed);
					Console.WriteLine("Total Time: {0}", startWatch.Elapsed);

					loopWatch.Restart();
				}
			}

			// Send whatever is left over; without this the last partial batch is lost.
			if (pending > 0) {
				await clientIndex.Documents.BulkAsync(req);
			}
		}