/// <summary>
/// Builds a metadata-only record for a file: full path, file name,
/// lower-cased extension and a title derived from the file name.
/// The file itself is not opened and no content is extracted.
/// </summary>
/// <param name="fi">The file to describe.</param>
/// <returns>A new <c>TDocs</c> with Path, Name, Extension and Title set.</returns>
public TDocs Process(FileInfo fi)
{
    TDocs tdoc = new TDocs();
    tdoc.Path = fi.FullName;
    tdoc.Name = fi.Name;
    tdoc.Extension = fi.Extension.ToLower();

    // The title is the name up to (not including) the last '.'.
    // A name with no '.' at all makes LastIndexOf return -1, which would make
    // Substring throw; in that case the whole name is used as the title.
    int dot = tdoc.Name.LastIndexOf('.');
    tdoc.Title = dot >= 0 ? tdoc.Name.Substring(0, dot) : tdoc.Name;

    return tdoc;
}
/// <summary>
/// Runs a paged keyword search over the "content" field of the index at
/// INDEX_DIR and returns one page of results, each with a highlighted abstract.
/// </summary>
/// <param name="q">keyword(s) to search for</param>
/// <param name="pageLen">maximum number of results on a page</param>
/// <param name="pageNo">page number; the first page is 1 (values below 1 are treated as the first page)</param>
/// <param name="recCount">receives the total number of hits for the query</param>
/// <returns>
/// The results of the requested page. A hit whose fields could not be read is
/// still returned, with whatever fields were filled in before the failure.
/// </returns>
public List<TDocs> Search(String q, int pageLen, int pageNo, out int recCount)
{
    // Keep the raw user input: the highlighter works on the original keywords,
    // while the query parser gets the tokenized, space-separated form.
    string keywords = q;
    List<TDocs> result = new List<TDocs>();

    IndexSearcher search = new IndexSearcher(INDEX_DIR);
    try
    {
        q = GetKeyWordsSplitBySpace(q, new PanGuTokenizer());
        QueryParser queryParser = new QueryParser("content", new PanGuAnalyzer(true));
        Query query = queryParser.Parse(q);
        Hits hits = search.Search(query, Sort.RELEVANCE);
        recCount = hits.Length();

        // A page number below 1 would give a negative start index and fill the
        // page with empty records (every hits.Doc(i) call would throw and be
        // swallowed below); clamp to the first hit instead.
        int i = (pageNo - 1) * pageLen;
        if (i < 0)
        {
            i = 0;
        }

        while (i < recCount && result.Count < pageLen)
        {
            TDocs docs = null;
            try
            {
                docs = new TDocs();
                docs.Path = hits.Doc(i).Get("path");
                docs.Name = hits.Doc(i).Get("name");
                docs.Title = hits.Doc(i).Get("title");
                docs.Extension = hits.Doc(i).Get("ext");

                // The full content is deliberately not copied into the result:
                // it can be large, costs memory and loading time, and the
                // abstract is enough for display.
                PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                    new PanGu.HighLight.SimpleHTMLFormatter("<color=red>", "</color>");
                PanGu.HighLight.Highlighter highlighter =
                    new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());
                highlighter.FragmentSize = 100;
                docs.Abstract = highlighter.GetBestFragment(keywords, hits.Doc(i).Get("content"));
            }
            catch (Exception e)
            {
                // Best effort per hit: log and keep the partially filled record.
                Debug.WriteLine(e.Message);
            }
            finally
            {
                result.Add(docs);
                i++;
            }
        }
    }
    finally
    {
        // Close the searcher even when parsing or searching throws, so the
        // index files are not left open.
        search.Close();
    }

    return result;
}
/// <summary>
/// Reads a file as text using the system default encoding and returns it as
/// a document record. Reading stops early once <c>timeLimit</c> milliseconds
/// have elapsed; whatever was read up to that point is kept.
/// </summary>
/// <param name="fi">The file to read.</param>
/// <returns>
/// A <c>TDocs</c> with Path, Name, Extension, Title and Content set. If the
/// file cannot be opened or read, the error is logged and Content holds the
/// text read before the failure (possibly empty).
/// </returns>
public TDocs Process(FileInfo fi)
{
    TDocs tdoc = new TDocs();
    tdoc.Path = fi.FullName;
    tdoc.Name = fi.Name;
    tdoc.Extension = fi.Extension.ToLower();

    // A name with no '.' makes LastIndexOf return -1 and Substring throw;
    // fall back to the whole name as the title.
    int dot = tdoc.Name.LastIndexOf('.');
    tdoc.Title = dot >= 0 ? tdoc.Name.Substring(0, dot) : tdoc.Name;

    StringBuilder strBuilder = new StringBuilder();
    try
    {
        // UTC is used for the deadline so a local clock/DST shift cannot
        // shorten or extend the time budget.
        DateTime deadline = DateTime.UtcNow.AddMilliseconds(timeLimit);

        // Open for reading only: the two-argument FileStream constructor asks
        // for read/write access and therefore fails on read-only files.
        // FileShare.ReadWrite also lets us read files another process has open.
        using (FileStream fs = new FileStream(fi.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            // A stateful decoder carries an incomplete multi-byte character
            // over to the next chunk; decoding each chunk independently would
            // corrupt characters that straddle a buffer boundary.
            Decoder decoder = Encoding.Default.GetDecoder();
            byte[] buf = new byte[1048576]; // 1 MB
            char[] chars = new char[Encoding.Default.GetMaxCharCount(buf.Length)];

            int byteRead;
            while (DateTime.UtcNow <= deadline && (byteRead = fs.Read(buf, 0, buf.Length)) > 0)
            {
                int charCount = decoder.GetChars(buf, 0, byteRead, chars, 0);
                strBuilder.Append(chars, 0, charCount);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: an unreadable file is still returned with its metadata.
        Debug.WriteLine(ex.Message);
    }

    // Always set Content (never leave it unassigned), keeping partial text.
    tdoc.Content = strBuilder.ToString();
    return tdoc;
}
/// <summary>
/// Fallback processor: when <c>indexAllFileNames</c> is set, returns a record
/// carrying only the file's metadata and an empty Content; otherwise the file
/// is skipped.
/// </summary>
/// <param name="fi">The file to describe.</param>
/// <returns>
/// A <c>TDocs</c> with empty Content, or <c>null</c> when
/// <c>indexAllFileNames</c> is false.
/// </returns>
private TDocs defaultProc(FileInfo fi)
{
    if (!indexAllFileNames)
    {
        return null;
    }

    TDocs tdoc = new TDocs();
    tdoc.Path = fi.FullName;
    tdoc.Name = fi.Name;
    tdoc.Extension = fi.Extension.ToLower();

    // Files without any '.' in their name reach this path too; LastIndexOf
    // returns -1 for them, which would make Substring throw, so the whole
    // name is used as the title in that case.
    int dot = tdoc.Name.LastIndexOf('.');
    tdoc.Title = dot >= 0 ? tdoc.Name.Substring(0, dot) : tdoc.Name;

    tdoc.Content = "";
    return tdoc;
}
/// <summary>
/// Processes one file and adds it to the index. While the call is running,
/// <c>asyncProcCtn</c> is incremented; on completion <c>count</c> and
/// <c>totalChars</c> are updated. All shared state is touched only while
/// holding <c>procMutex</c>.
/// </summary>
/// <param name="fi">
/// The file to process; must be a <see cref="FileInfo"/> (it is passed as
/// <c>object</c>, presumably to fit a work-item callback signature).
/// </param>
private void ProcessDocsCallBack(object fi)
{
    procMutex.WaitOne();
    asyncProcCtn++;
    procMutex.ReleaseMutex();

    try
    {
        // Content extraction runs outside the mutex so several files can be
        // processed concurrently; only the index update is serialized.
        TDocs tdoc = docProc.DealWithDoc((FileInfo)fi);
        if (tdoc == null)
        {
            return;
        }

        procMutex.WaitOne();
        try
        {
            IndexFileContent(tdoc);
            if (tdoc.Content != null)
            {
                totalChars += tdoc.Content.Length;
            }
            count++;
        }
        finally
        {
            // Released on every path. The previous code returned with the
            // mutex still held once count reached MaxCount, and also leaked it
            // when indexing threw, blocking every other caller.
            procMutex.ReleaseMutex();
        }
    }
    finally
    {
        // The in-flight counter is decremented on every exit path (skipped
        // file, success, or exception) so it cannot get stuck above zero.
        procMutex.WaitOne();
        asyncProcCtn--;
        procMutex.ReleaseMutex();
    }
}
/// <summary>
/// Extracts the text of a PDF file page by page and returns it as a document
/// record. Extraction stops early once <c>timeLimit</c> milliseconds have
/// elapsed; the pages extracted up to that point are kept.
/// </summary>
/// <param name="fi">The PDF file to read.</param>
/// <returns>
/// A <c>TDocs</c> with Path, Name, Extension, Title and Content set. If the
/// PDF cannot be opened or parsed, the error is logged and Content holds the
/// text extracted before the failure (possibly empty).
/// </returns>
public TDocs Process(FileInfo fi)
{
    TDocs tdoc = new TDocs();
    tdoc.Path = fi.FullName;
    tdoc.Name = fi.Name;
    tdoc.Extension = fi.Extension.ToLower();

    // A name with no '.' makes LastIndexOf return -1 and Substring throw;
    // fall back to the whole name as the title.
    int dot = tdoc.Name.LastIndexOf('.');
    tdoc.Title = dot >= 0 ? tdoc.Name.Substring(0, dot) : tdoc.Name;

    StringBuilder strBuilder = new StringBuilder();
    PdfReader pdfReader = null;
    try
    {
        // The time budget starts before the file is opened, so opening counts
        // against it. UTC avoids local clock/DST shifts affecting the budget.
        DateTime deadline = DateTime.UtcNow.AddMilliseconds(timeLimit);

        pdfReader = new PdfReader(fi.FullName);
        int numberOfPages = pdfReader.NumberOfPages;

        // Page numbers passed to GetTextFromPage start at 1.
        for (int page = 1; page <= numberOfPages && DateTime.UtcNow <= deadline; ++page)
        {
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            strBuilder.Append(
                iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }
    catch (Exception ex)
    {
        // Best effort: an unreadable PDF is still returned with its metadata.
        Debug.WriteLine(ex.Message);
    }
    finally
    {
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }

    // Always set Content (never leave it unassigned), keeping partial text.
    tdoc.Content = strBuilder.ToString();
    return tdoc;
}
/// <summary>
/// Adds one document to the index through <c>writer</c>. The fields written
/// are "path" (stored, not indexed), "name", "title" and "content" (stored,
/// tokenized) and "ext" (stored, lower-cased, indexed untokenized).
/// </summary>
/// <param name="tdoc">The document record to index.</param>
/// <returns>The writer's document count after the add.</returns>
public int IndexFileContent(TDocs tdoc)
{
    // Content can be null when text extraction failed before assigning it.
    // NOTE(review): Lucene.Net's Field constructor is expected to reject a
    // null value — confirm against the Lucene.Net version in use. Indexing an
    // empty string keeps the file findable by name and title.
    string content = tdoc.Content ?? "";

    Document doc = new Document();
    doc.Add(new Field("path", tdoc.Path, Field.Store.YES, Field.Index.NO));
    doc.Add(new Field("name", tdoc.Name, Field.Store.YES, Field.Index.TOKENIZED));
    doc.Add(new Field("title", tdoc.Title, Field.Store.YES, Field.Index.TOKENIZED));
    doc.Add(new Field("ext", tdoc.Extension.ToLower(), Field.Store.YES, Field.Index.UN_TOKENIZED));
    doc.Add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED));

    writer.AddDocument(doc);
    return writer.DocCount();
}
/// <summary>
/// Extracts the text of a Word document by automating Microsoft Word and
/// returns it as a document record. The first non-empty paragraph becomes the
/// title when it is shorter than <c>WordProc.MaxTitle</c>; otherwise the title
/// is the file name without its extension. Extraction stops early once
/// <c>timeLimit</c> milliseconds have elapsed.
/// </summary>
/// <param name="fi">The Word file to read.</param>
/// <returns>
/// <c>null</c> for files whose name starts with "~$"; otherwise a <c>TDocs</c>
/// with Path, Name, Extension and Title set. Content is set whenever Word
/// could be started; if opening or reading the document fails, the error is
/// logged and Content holds the text read before the failure.
/// </returns>
public TDocs Process(FileInfo fi)
{
    if (fi.Name.StartsWith("~$"))
    {
        return null;
    }

    // Build the metadata before starting Word, so that nothing here can throw
    // while a Word process is already running and not yet covered by the
    // try/finally below.
    TDocs tdoc = new TDocs();
    tdoc.Path = fi.FullName;
    tdoc.Name = fi.Name;
    tdoc.Extension = fi.Extension.ToLower();

    // A name with no '.' makes LastIndexOf return -1 and Substring throw;
    // fall back to the whole name as the title.
    int dot = tdoc.Name.LastIndexOf('.');
    tdoc.Title = dot >= 0 ? tdoc.Name.Substring(0, dot) : tdoc.Name;

    Word.Application app = null;
    try
    {
        app = new Microsoft.Office.Interop.Word.Application();
    }
    catch (Exception ex)
    {
        // Word could not be started: return metadata only.
        Debug.Write(ex.Message);
        return tdoc;
    }

    StringBuilder strBuilder = new StringBuilder();
    Word.Document doc = null;
    object unknow = Type.Missing;
    try
    {
        DateTime dtStart = DateTime.Now;
        object conf = false;      // ConfirmConversions
        object readOnly = true;   // ReadOnly: the document is only read, never modified
        app.Visible = false;
        object file = fi.FullName;
        doc = app.Documents.Open(ref file, ref conf, ref readOnly, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow, ref unknow);

        string strTitle = "";

        // Paragraph indices run from 1 to Count, not from 0. Count is read
        // once instead of on every iteration.
        int paragraphCount = doc.Paragraphs.Count;
        for (int i = 1; i <= paragraphCount; i++)
        {
            string temp = doc.Paragraphs[i].Range.Text.Trim();
            if (temp == "")
            {
                continue;
            }

            // The first non-empty paragraph is the title candidate.
            if (strTitle == "")
            {
                strTitle = temp;
            }

            strBuilder.AppendLine(temp);

            if (dtStart.AddMilliseconds(timeLimit) < DateTime.Now)
            {
                break;
            }
        }

        if (strTitle != "" && strTitle.Length < WordProc.MaxTitle)
        {
            tdoc.Title = strTitle;
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex.Message);
    }
    finally
    {
        // Cleanup runs on every path and must not throw: close the document
        // (if it was opened) without saving, then shut Word down.
        try
        {
            if (doc != null)
            {
                object noSave = Word.WdSaveOptions.wdDoNotSaveChanges;
                ((Microsoft.Office.Interop.Word._Document)doc).Close(ref noSave, ref unknow, ref unknow);
            }
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }

        try
        {
            ((Microsoft.Office.Interop.Word._Application)app).Quit();
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }
    }

    // Always set Content once Word was started, keeping partial text.
    tdoc.Content = strBuilder.ToString();
    return tdoc;
}