예제 #1
0
        /// <summary>
        /// Parses a raw email message and passes its body text to the indexer.
        /// </summary>
        /// <param name="txwg">Write group handed through to <c>indexer.index_document</c>.</param>
        /// <param name="docid">Identifier under which the message body is indexed.</param>
        /// <param name="msgtxt">Raw message text; input longer than 4096 characters is cut to its first 4095.</param>
        /// <param name="numwords">Assigned by <c>indexer.index_document</c>.</param>
        public void parse_msg(LayerWriteGroup txwg, string docid, string msgtxt, out int numwords)
        {
            const int max_chars = 4 * 1024;

            // cap the amount of text handed to the MIME parser
            string raw = (msgtxt.Length > max_chars)
                ? msgtxt.Substring(0, max_chars - 1)
                : msgtxt;

            #if true
                // parse with SharpMimeTools
                SharpMessage parsed = new anmar.SharpMimeTools.SharpMessage(raw);
                System.Console.WriteLine("Subject: " + parsed.Subject);

                indexer.index_document(txwg, docid, parsed.Body, out numwords);
            #else
                // alternative parser: LumiSoft
                Mime parsed = LumiSoft.Net.Mime.Mime.Parse(System.Text.Encoding.Default.GetBytes(raw));
                System.Console.WriteLine("Subject: " + parsed.MainEntity.Subject);
                indexer.index_document(txwg, docid, parsed.MainEntity.DataText, out numwords);
            #endif

            // Attachments are currently not inspected. Disabled sketch kept for reference:
            //foreach (SharpAttachment part in parsed.Attachments) {
            //    if (part.MimeTopLevelMediaType == MimeTopLevelMediaType.text &&
            //        part.MimeMediaSubType == "plain") {
            //        System.Console.WriteLine("Attachment: " + part.Size);
            //    }
            //}
        }
예제 #2
0
        /// <summary>
        /// Generates 100 synthetic documents, each made of 10000 random numeric
        /// "words" drawn from <c>id_gen.Next(5000)</c>, and indexes them through a
        /// MEMORY_ONLY write group. Progress and a debug dump are emitted every
        /// tenth document.
        /// </summary>
        public void parse_email_messages()
        {
            const int num_docs = 100;
            const int words_per_doc = 10000;
            const int vocabulary_size = 5000;

            LayerWriteGroup txwg = new LayerWriteGroup(db, type: LayerWriteGroup.WriteGroupType.MEMORY_ONLY);

            for (int docnum = 0; docnum < num_docs; docnum++) {
                // Allocate a fresh list per document. Sharing one list across the
                // outer loop made every document also contain all words generated
                // for the documents before it.
                List<string> word_list = new List<string>(words_per_doc);

                for (int wordnum = 0; wordnum < words_per_doc; wordnum++) {
                    int word = id_gen.Next(vocabulary_size);
                    word_list.Add("" + word);
                }

                // the document id is simply the document number
                indexer.index_document(txwg, "" + docnum, word_list);

                if (docnum % 10 == 0) {
                    System.Console.WriteLine("doc {0}", docnum);
                    gui.debugDump(db);
                }
            }
        }
예제 #3
0
        /// <summary>
        /// Reads every file in the hard-coded data directory, splits each file into
        /// messages on lines that start with "From ", and indexes each message via
        /// <c>parse_msg</c>. Afterwards the working segment is flushed and up to 40
        /// merge candidates are merged so the database is ready to be searched.
        /// </summary>
        public void parse_email_messages()
        {
            string basepath = @"c:\EmailTest\Data";

            // injection stops once the document counter exceeds this value
            const long max_docs = 4000000000;

            // http://www.csharp-examples.net/get-files-from-directory/
            string[] filePaths = Directory.GetFiles(basepath);

            // NOTE: both counters start at 1, so the first document id ends in ":2"
            long doc_count = 1;
            long word_count = 1;
            DateTime start = DateTime.Now;
            bool limit_reached = false;

            foreach (var fn in filePaths) {
                String fullpath = Path.Combine(basepath, fn);
                System.Console.WriteLine(fullpath);

                // 'using' closes the file handle on every exit path, including the early stop
                using (FileStream r = File.Open(fullpath, FileMode.Open, FileAccess.Read, FileShare.Read))
                using (BufferedStream reader = new BufferedStream(r)) {
                    List<string> lines = new List<string>();
                    LayerWriteGroup txwg = new LayerWriteGroup(db, type: LayerWriteGroup.WriteGroupType.MEMORY_ONLY);

                    while (reader.Position < reader.Length - 1) {
                        string line = UnixReadLine(reader);
                        if (line.Length > 6 && line.Substring(0, 5) == "From ") {
                            // a separator line closes the message collected so far
                            if (lines.Count > 0) {
                                index_mbox_message(txwg, fullpath, lines, start, ref doc_count, ref word_count);
                                if (doc_count > max_docs) {
                                    limit_reached = true;
                                    break;
                                }
                            }
                            lines = new List<string>();
                        } else {
                            lines.Add(line);
                        }

                        if (doc_count % 50000 == 0) { gui.debugDump(db); }

                    } // while adding docs

                    // The last message of a file has no separator after it; index it
                    // here, otherwise it would be silently dropped.
                    if (!limit_reached && lines.Count > 0) {
                        index_mbox_message(txwg, fullpath, lines, start, ref doc_count, ref word_count);
                        if (doc_count > max_docs) {
                            limit_reached = true;
                        }
                    }
                }

                if (limit_reached) { break; }

            } // foreach file

            // the banner is only printed when all files were processed without hitting the limit
            if (!limit_reached) {
                Console.WriteLine("=================== EmailInjector end... time to fully optimize...");
            }

            // we have to lock to assure we don't collide with the background merge thread
            lock (db) {
                // be sure to flush and merge before we search...
                db.flushWorkingSegment();
                gui.debugDump(db);
                for (int x = 0; x < 40; x++) {
                    var mc = db.rangemapmgr.mergeManager.getBestCandidate();
                    gui.debugDump(db, mc);
                    if (mc == null) { break; }
                    db.performMerge(mc);
                    gui.debugDump(db);
                }
            }
        }

        // Joins the collected lines into one message, indexes it under "<fullpath>:<doc_count>",
        // updates the running counters, and prints throughput statistics.
        private void index_mbox_message(LayerWriteGroup txwg, string fullpath, List<string> lines,
            DateTime start, ref long doc_count, ref long word_count)
        {
            string msg = String.Join("\n", lines);
            doc_count++;

            string docid = fullpath + ":" + doc_count;
            int doc_numwords;
            parse_msg(txwg, docid, msg, out doc_numwords);
            word_count += doc_numwords;

            DateTime cur = DateTime.Now;
            double elapsed_s = (cur - start).TotalSeconds;
            Console.WriteLine("doc{0}: {1}    elapsed:{2}    docs/sec:{3}    words/sec:{4}",
                doc_count, docid, elapsed_s, (float)doc_count / elapsed_s, (float)word_count / elapsed_s);
            gui.debugDump(db);
        }