// Parse one raw email message and index its body text under docid.
//
// txwg     - write group the index entries are added to
// docid    - unique document id (here: "<filepath>:<doc#>", see parse_email_messages)
// msgtxt   - raw RFC-2822 message text; truncated to MAX_MSG_BYTES before
//            parsing to bound per-document parse/index cost
// numwords - out: number of words indexed (set by indexer.index_document)
public void parse_msg(LayerWriteGroup txwg, string docid, string msgtxt, out int numwords) {
    const int MAX_MSG_BYTES = 4 * 1024;
    if (msgtxt.Length > MAX_MSG_BYTES) {
        // fix: previously truncated to MAX_MSG_BYTES - 1 (4095), one short of
        // the 4KB limit the guard tests against; keep the full 4KB.
        msgtxt = msgtxt.Substring(0, MAX_MSG_BYTES);
    }
#if true // SharpMimeTools MIME parser
    SharpMessage msg = new anmar.SharpMimeTools.SharpMessage(msgtxt);
    System.Console.WriteLine("Subject: " + msg.Subject);
    indexer.index_document(txwg, docid, msg.Body, out numwords);
#else // LumiSoft MIME parser (alternative implementation, kept for comparison)
    Mime msg = LumiSoft.Net.Mime.Mime.Parse(System.Text.Encoding.Default.GetBytes(msgtxt));
    System.Console.WriteLine("Subject: " + msg.MainEntity.Subject);
    indexer.index_document(txwg, docid, msg.MainEntity.DataText, out numwords);
#endif
}
// Generate synthetic documents of random words and insert them into the index.
// Produces 100 documents of 10,000 words each, words drawn from a 5,000-word
// vocabulary via id_gen; dumps index state to the GUI every 10 documents.
//
// NOTE(review): another parse_email_messages() with the same signature appears
// later in this file; one of the two must be removed or renamed for the class
// to compile — confirm which is live.
public void parse_email_messages() {
    LayerWriteGroup txwg = new LayerWriteGroup(db, type: LayerWriteGroup.WriteGroupType.MEMORY_ONLY);
    for (int docnum = 0; docnum < 100; docnum++) {
        // fix: build a fresh word list per document. Previously the list was
        // declared outside this loop and never cleared, so document N was
        // indexed with every word generated for documents 0..N-1 as well
        // (growing to 1,000,000 entries by the last doc).
        List<string> word_list = new List<string>();
        for (int wordnum = 0; wordnum < 10000; wordnum++) {
            int word = id_gen.Next(5000);
            word_list.Add("" + word);
        }
        indexer.index_document(txwg, "" + docnum, word_list);
        if (docnum % 10 == 0) {
            System.Console.WriteLine("doc {0}", docnum);
            gui.debugDump(db);
        }
    }
}
// Inject mbox-format email files from basepath into the index.
// Each file may contain many messages; a line starting with "From " begins a
// new message (mbox convention). Every message is parsed and indexed via
// parse_msg, with throughput logged per document. After all files are loaded,
// the working segment is flushed and up to 40 merges are performed.
//
// NOTE(review): another parse_email_messages() with the same signature appears
// earlier in this file; one of the two must be removed or renamed for the
// class to compile — confirm which is live.
public void parse_email_messages() {
    string basepath = @"c:\EmailTest\Data";
    string[] filePaths = Directory.GetFiles(basepath);
    long doc_count = 1;
    long word_count = 1;
    DateTime start = DateTime.Now;
    foreach (var fn in filePaths) {
        // GetFiles already returns full paths, so Combine is effectively a
        // no-op here; kept for safety if basepath handling changes.
        String fullpath = Path.Combine(basepath, fn);
        System.Console.WriteLine(fullpath);
        LayerWriteGroup txwg = new LayerWriteGroup(db, type: LayerWriteGroup.WriteGroupType.MEMORY_ONLY);
        List<string> lines = new List<string>();
        // Index the accumulated lines of one message (no-op when empty).
        // Closure captures doc_count/word_count by reference so the running
        // totals and throughput numbers stay correct.
        Action flush_pending = () => {
            string msg = String.Join("\n", lines);
            doc_count++;
            string docid = fullpath + ":" + doc_count;
            int doc_numwords;
            parse_msg(txwg, docid, msg, out doc_numwords);
            word_count += doc_numwords;
            double elapsed_s = (DateTime.Now - start).TotalSeconds;
            Console.WriteLine("doc{0}: {1} elapsed:{2} docs/sec:{3} words/sec:{4}",
                doc_count, docid, elapsed_s,
                (float)doc_count / elapsed_s, (float)word_count / elapsed_s);
            gui.debugDump(db);
        };
        // fix: dispose the streams when each file is done; previously the
        // FileStream/BufferedStream were never closed, leaking a file handle
        // per file scanned.
        using (FileStream r = File.Open(fullpath, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (BufferedStream reader = new BufferedStream(r)) {
            while (reader.Position < reader.Length - 1) {
                string line = UnixReadLine(reader);
                // mbox message delimiter (ordinal compare, same semantics as
                // the old Substring(0, 5) == "From " check).
                if (line.Length > 6 && line.StartsWith("From ", StringComparison.Ordinal)) {
                    if (lines.Count > 0) {
                        flush_pending();
                        // end after a certain number of docs...
                        if (doc_count > 4000000000) { goto end_now; }
                    }
                    lines = new List<string>();
                } else {
                    lines.Add(line);
                }
                if (doc_count % 50000 == 0) { gui.debugDump(db); }
            } // while adding docs
        }
        // fix: the final message of each file has no trailing "From " line
        // after it; previously it was accumulated but never indexed.
        if (lines.Count > 0) {
            flush_pending();
            if (doc_count > 4000000000) { goto end_now; }
        }
    } // foreach file
    Console.WriteLine("=================== EmailInjector end... time to fully optimize...");
end_now:
    // we have to lock to assure we don't collide with the background merge thread
    lock (db) {
        // be sure to flush and merge before we search...
        db.flushWorkingSegment();
        gui.debugDump(db);
        for (int x = 0; x < 40; x++) {
            var mc = db.rangemapmgr.mergeManager.getBestCandidate();
            gui.debugDump(db, mc);
            if (mc == null) { break; }
            db.performMerge(mc);
            gui.debugDump(db);
        }
    }
}