Пример #1
0
        /// <summary>
        /// Builds a <see cref="TDocs"/> record from the file-system metadata of
        /// <paramref name="fi"/>. The file itself is not opened and no content is read.
        /// </summary>
        /// <param name="fi">File to describe.</param>
        /// <returns>
        /// A record with Path, Name, lower-cased Extension and Title (the file name
        /// without its extension) filled in.
        /// </returns>
        public TDocs Process(FileInfo fi)
        {
            TDocs tdoc = new TDocs();

            tdoc.Path      = fi.FullName;
            tdoc.Name      = fi.Name;
            tdoc.Extension = fi.Extension.ToLower();
            // Substring(0, LastIndexOf('.')) throws when the name has no '.', e.g. "README";
            // GetFileNameWithoutExtension returns the whole name in that case.
            tdoc.Title     = Path.GetFileNameWithoutExtension(tdoc.Name);
            return(tdoc);
        }
Пример #2
0
        /// <summary>
        /// Searches the "content" field of the index at INDEX_DIR and returns one page of
        /// hits ordered by relevance, each with a highlighted abstract.
        /// </summary>
        /// <param name="q">keyword</param>
        /// <param name="pageLen">every page's length</param>
        /// <param name="pageNo">page number, counted from 1; values below 1 are treated as the first page</param>
        /// <param name="recCount">total number of hits for the query (all pages)</param>
        /// <returns>
        /// At most <paramref name="pageLen"/> entries. A hit that could not be loaded still
        /// occupies a slot (partially filled, or null if the entry could not be created),
        /// so callers should be prepared for incomplete entries.
        /// </returns>
        public List <TDocs> Search(String q, int pageLen, int pageNo, out int recCount)
        {
            string        keywords = q;
            IndexSearcher search   = new IndexSearcher(INDEX_DIR);

            try
            {
                q = GetKeyWordsSplitBySpace(q, new PanGuTokenizer());
                QueryParser queryParser = new QueryParser("content", new PanGuAnalyzer(true));

                Query query = queryParser.Parse(q);

                Hits hits = search.Search(query, Sort.RELEVANCE);

                List <TDocs> result = new List <TDocs>();

                recCount = hits.Length();
                int i = (pageNo - 1) * pageLen;
                if (i < 0)
                {
                    // A page number below 1 would otherwise index hits with a negative
                    // position and fill the page with empty entries.
                    i = 0;
                }

                while (i < recCount && result.Count < pageLen)
                {
                    TDocs docs = null;

                    try
                    {
                        docs = new TDocs();

                        // Fetch the stored document once instead of once per field.
                        var hit = hits.Doc(i);

                        docs.Path      = hit.Get("path");
                        docs.Name      = hit.Get("name");
                        docs.Title     = hit.Get("title");
                        docs.Extension = hit.Get("ext");
                        // Content is deliberately not copied into the result: it can be large,
                        // and the highlighted abstract below is enough for a result list.

                        PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                            new PanGu.HighLight.SimpleHTMLFormatter("<color=red>", "</color>");

                        PanGu.HighLight.Highlighter highlighter =
                            new PanGu.HighLight.Highlighter(simpleHTMLFormatter,
                                                            new Segment());

                        highlighter.FragmentSize = 100;
                        // Highlight against the original keywords, not the re-tokenized query string.
                        docs.Abstract            = highlighter.GetBestFragment(keywords, hit.Get("content"));
                    }
                    catch (Exception e)
                    {
                        // Best effort: a hit that fails to load must not abort the whole page.
                        Debug.WriteLine(e.Message);
                    }
                    finally
                    {
                        result.Add(docs);
                        i++;
                    }
                }
                return(result);
            }
            finally
            {
                // Release the searcher even when tokenizing, parsing or searching throws.
                search.Close();
            }
        }
Пример #3
0
        /// <summary>
        /// Builds a <see cref="TDocs"/> record for <paramref name="fi"/> and reads the file
        /// as text in the system default encoding, stopping once timeLimit milliseconds
        /// have elapsed.
        /// </summary>
        /// <param name="fi">File to read.</param>
        /// <returns>
        /// The populated record. Content holds the text read so far (possibly truncated by
        /// the time limit). If the file cannot be opened or read, the error is written to
        /// the debug output and Content is left unset.
        /// </returns>
        public TDocs Process(FileInfo fi)
        {
            TDocs tdoc = new TDocs();

            tdoc.Path      = fi.FullName;
            tdoc.Name      = fi.Name;
            tdoc.Extension = fi.Extension.ToLower();
            // Unlike Substring(0, LastIndexOf('.')), this does not throw for names without a '.'.
            tdoc.Title     = Path.GetFileNameWithoutExtension(tdoc.Name);

            try
            {
                StringBuilder strBuilder = new StringBuilder();
                Stopwatch     elapsed    = Stopwatch.StartNew();

                // Request read access only: the (path, mode) constructor asks for read/write
                // access and therefore fails on read-only files.
                using (FileStream fs = new FileStream(fi.FullName, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    // A Decoder keeps state between calls, so a multi-byte character that is
                    // split across two reads is still decoded correctly.
                    Decoder decoder = Encoding.Default.GetDecoder();
                    byte[]  buf     = new byte[1048576];//1mb
                    char[]  chars   = new char[Encoding.Default.GetMaxCharCount(buf.Length)];
                    int     byteRead;

                    while (elapsed.ElapsedMilliseconds <= timeLimit
                           && (byteRead = fs.Read(buf, 0, buf.Length)) > 0)
                    {
                        int charCount = decoder.GetChars(buf, 0, byteRead, chars, 0);
                        strBuilder.Append(chars, 0, charCount);
                    }
                }
                tdoc.Content = strBuilder.ToString();
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }
            return(tdoc);
        }
Пример #4
0
 /// <summary>
 /// Fallback processor: when indexAllFileNames is set, describes the file by its
 /// metadata only, with empty content.
 /// </summary>
 /// <param name="fi">File to describe.</param>
 /// <returns>
 /// A record with Path, Name, lower-cased Extension, Title and an empty Content,
 /// or null when indexAllFileNames is false.
 /// </returns>
 private TDocs defaultProc(FileInfo fi)
 {
     if (!indexAllFileNames)
     {
         return(null);
     }

     TDocs tdoc = new TDocs();
     tdoc.Path      = fi.FullName;
     tdoc.Name      = fi.Name;
     tdoc.Extension = fi.Extension.ToLower();
     // Substring(0, LastIndexOf('.')) throws for names without a '.' (e.g. "Makefile");
     // GetFileNameWithoutExtension returns the whole name instead.
     tdoc.Title     = Path.GetFileNameWithoutExtension(tdoc.Name);
     tdoc.Content   = "";
     return(tdoc);
 }
Пример #5
0
        /// <summary>
        /// Work-item callback: extracts one file through docProc and adds the result to
        /// the index, keeping asyncProcCtn, count and totalChars up to date under procMutex.
        /// </summary>
        /// <param name="fi">The <see cref="FileInfo"/> to process, boxed as object.</param>
        private void ProcessDocsCallBack(object fi)
        {
            procMutex.WaitOne();
            asyncProcCtn++;
            procMutex.ReleaseMutex();

            try
            {
                // Extraction runs outside the mutex so several files can be parsed in parallel.
                TDocs tdoc = docProc.DealWithDoc((FileInfo)fi);

                if (tdoc == null)
                {
                    return;
                }

                procMutex.WaitOne();
                try
                {
                    // NOTE(review): the original returned at the limit while still holding the
                    // mutex, which blocked every later callback. Skipping work once the limit
                    // is reached is assumed to be the intent of MaxCount -- confirm.
                    if (count >= MaxCount)
                    {
                        return;
                    }

                    IndexFileContent(tdoc);

                    if (tdoc.Content != null)
                    {
                        totalChars += tdoc.Content.Length;
                    }
                    count++;
                }
                finally
                {
                    // Always release, including on the early return and when indexing throws.
                    procMutex.ReleaseMutex();
                }
            }
            finally
            {
                // The in-flight counter must drop on every exit path, including exceptions.
                procMutex.WaitOne();
                asyncProcCtn--;
                procMutex.ReleaseMutex();
            }
        }
Пример #6
0
        /// <summary>
        /// Builds a <see cref="TDocs"/> record for a PDF file and extracts its text page by
        /// page, stopping once timeLimit milliseconds have elapsed.
        /// </summary>
        /// <param name="fi">PDF file to read.</param>
        /// <returns>
        /// The populated record. Content holds the text of the pages extracted so far
        /// (possibly truncated by the time limit). If the PDF cannot be read, the error is
        /// written to the debug output and Content is left unset.
        /// </returns>
        public TDocs Process(FileInfo fi)
        {
            PdfReader pdfReader = null;

            TDocs tdoc = new TDocs();

            tdoc.Path      = fi.FullName;
            tdoc.Name      = fi.Name;
            tdoc.Extension = fi.Extension.ToLower();
            // Unlike Substring(0, LastIndexOf('.')), this does not throw for names without a '.'.
            tdoc.Title     = Path.GetFileNameWithoutExtension(tdoc.Name);
            try
            {
                StringBuilder strBuilder = new StringBuilder();
                // Stopwatch measures elapsed time and is unaffected by system clock changes.
                Stopwatch     elapsed    = Stopwatch.StartNew();

                pdfReader = new PdfReader(fi.FullName);
                int numberOfPages = pdfReader.NumberOfPages;

                // Pages are addressed from 1 to NumberOfPages.
                for (int i = 1; i <= numberOfPages; ++i)
                {
                    if (elapsed.ElapsedMilliseconds > timeLimit)
                    {
                        break;
                    }
                    iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                    strBuilder.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy));
                }
                tdoc.Content = strBuilder.ToString();
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }
            finally
            {
                if (pdfReader != null)
                {
                    pdfReader.Close();
                }
            }
            return(tdoc);
        }
Пример #7
0
        /// <summary>
        /// Adds <paramref name="tdoc"/> to the index as one document with the fields
        /// "path", "name", "title", "ext" and "content".
        /// </summary>
        /// <param name="tdoc">
        /// Record to index. Text properties that are null are indexed as empty strings,
        /// because the Field constructor rejects null values.
        /// </param>
        /// <returns>The writer's document count after the addition.</returns>
        public int IndexFileContent(TDocs tdoc)
        {
            // Processors may leave Content (or other properties) unset when extraction
            // fails; normalise to empty strings so the file is still indexed by name.
            string path    = tdoc.Path ?? string.Empty;
            string name    = tdoc.Name ?? string.Empty;
            string title   = tdoc.Title ?? string.Empty;
            string ext     = (tdoc.Extension ?? string.Empty).ToLower();
            string content = tdoc.Content ?? string.Empty;

            Document doc = new Document();

            // path is stored for retrieval only; it is not searchable.
            doc.Add(new Field("path", path, Field.Store.YES, Field.Index.NO));
            doc.Add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
            // ext is indexed as a single exact term.
            doc.Add(new Field("ext", ext, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED));

            writer.AddDocument(doc);

            return(writer.DocCount());
        }
Пример #8
0
        /// <summary>
        /// Builds a <see cref="TDocs"/> record for a Word document by opening it through
        /// Word automation and collecting the text of its non-empty paragraphs, stopping
        /// once timeLimit milliseconds have elapsed.
        /// </summary>
        /// <param name="fi">Word file to read.</param>
        /// <returns>
        /// null for files whose name starts with "~$". Otherwise the populated record:
        /// Title is the first non-empty paragraph when it is shorter than
        /// WordProc.MaxTitle, else the file name without extension. Content is left unset
        /// when Word cannot be started or the document cannot be read.
        /// </returns>
        public TDocs Process(FileInfo fi)
        {
            // NOTE(review): "~$" files are presumably Word's temporary owner/lock files -- confirm.
            if (fi.Name.StartsWith("~$", StringComparison.Ordinal))
            {
                return(null);
            }

            Word.Application app = null;

            try
            {
                app = new Microsoft.Office.Interop.Word.Application();
            }
            catch (Exception ex)
            {
                // Word is not available; fall back to metadata only below.
                Debug.Write(ex.Message);
            }

            TDocs tdoc = new TDocs();

            tdoc.Path      = fi.FullName;
            tdoc.Name      = fi.Name;
            tdoc.Extension = fi.Extension.ToLower();
            // Unlike Substring(0, LastIndexOf('.')), this does not throw for names without a '.'.
            tdoc.Title     = Path.GetFileNameWithoutExtension(tdoc.Name);

            if (app == null)
            {
                return(tdoc);
            }

            Word.Document doc    = null;
            object        unknow = Type.Missing;

            try
            {
                Stopwatch elapsed     = Stopwatch.StartNew();
                object    conf        = false;
                // Indexing only reads the document: open it read-only and keep it out of
                // the user's recent-files list.
                object    readOnly    = true;
                object    addToRecent = false;
                object    file        = fi.FullName;

                app.Visible = false;
                doc = app.Documents.Open(ref file,
                                         ref conf, ref readOnly, ref addToRecent, ref unknow,
                                         ref unknow, ref unknow, ref unknow, ref unknow,
                                         ref unknow, ref unknow, ref unknow, ref unknow,
                                         ref unknow, ref unknow, ref unknow);

                StringBuilder strBuilder = new StringBuilder();
                string        strTitle   = "";

                // Fetch the collection and its size once; each access is a COM call.
                Word.Paragraphs paragraphs     = doc.Paragraphs;
                int             paragraphCount = paragraphs.Count;

                //notice that: the index of doc.Paragraphs counts from 1~Count, not starts with 0
                for (int i = 1; i <= paragraphCount; i++)
                {
                    string temp = paragraphs[i].Range.Text.Trim();
                    if (temp == "")
                    {
                        continue;
                    }
                    //the first non-empty paragraph may be the title
                    if (strTitle == "")
                    {
                        strTitle = temp;
                    }
                    strBuilder.AppendLine(temp);
                    if (elapsed.ElapsedMilliseconds > timeLimit)
                    {
                        break;
                    }
                }
                if (strTitle != "" && strTitle.Length < WordProc.MaxTitle)
                {
                    tdoc.Title = strTitle;
                }
                tdoc.Content = strBuilder.ToString();
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }
            finally
            {
                // Close the document on every path, not only on success, and make sure a
                // failing Close cannot prevent Word itself from being shut down.
                if (doc != null)
                {
                    try
                    {
                        ((Microsoft.Office.Interop.Word._Document)doc).Close(ref unknow, ref unknow, ref unknow);
                    }
                    catch (Exception ex)
                    {
                        Debug.WriteLine(ex.Message);
                    }
                }
                ((Microsoft.Office.Interop.Word._Application)app).Quit();
            }
            return(tdoc);
        }