/// <summary>
/// Creates an index document for a file on disk, choosing the document
/// type from the file extension (case-insensitive).
/// </summary>
/// <param name="file">Path of the file to load.</param>
/// <param name="parent">Data source that owns the resulting document.</param>
/// <returns>
/// A PDF, plain-text or HTML document, or <c>null</c> when the extension
/// is not one of <c>.pdf</c>, <c>.txt</c>, <c>.html</c> or <c>.htm</c>.
/// </returns>
public static IIndexDocument FromFile(string file, IIndexDataSource parent)
{
    // Invariant lower-casing keeps extension matching independent of the current culture.
    string lowered = file.ToLowerInvariant();

    if (lowered.EndsWith(".pdf"))
    {
        return new IndexPDFDocument(file, parent);
    }

    if (lowered.EndsWith(".txt"))
    {
        return new IndexedTextFile(file, parent);
    }

    // The original tested ".html" twice; the second test was meant to cover ".htm".
    if (lowered.EndsWith(".html") || lowered.EndsWith(".htm"))
    {
        using (FileStream fs = File.OpenRead(file))
        {
            return WebDataSource.FromHtml(fs, file, parent.Name);
        }
    }

    // Unsupported file type.
    return null;
}
/// <summary>
/// Loads the index document referenced by the "file" entry of
/// <paramref name="item"/>.
/// </summary>
/// <param name="item">Stored item describing the document.</param>
/// <returns>
/// A document loaded from the web when the resolved location contains
/// "://", a document loaded from disk otherwise, or <c>null</c> when the
/// item has no "file" entry.
/// </returns>
protected override IIndexDocument DocumentFromItem(Document item)
{
    // Nothing to load without a file reference.
    if (!item.ContainsKey("file"))
    {
        return(null);
    }

    // The stored value is appended to this data source's Path.
    string location = Path + item["file"];

    // A scheme separator marks a remote resource.
    if (location.Contains("://"))
    {
        return(WebDataSource.FromUrl(location, this));
    }

    return(DocumentsDataSource.FromFile(location, this));
}
/// <summary>
/// Adds a document supplied as a BLOB. The content type is sniffed from the
/// first bytes of the stream: a "%PDF" signature yields a PDF document, an
/// "&lt;html" marker yields an HTML document, and anything else is read as
/// text using a detected charset.
/// </summary>
/// <param name="name">Name of the record.</param>
/// <param name="stream">
/// Seekable stream with the record content; it is rewound to the start
/// after sniffing.
/// </param>
/// <param name="fields">Optional header fields attached to the document; may be <c>null</c>.</param>
/// <param name="queue">Queue the created document is enqueued to.</param>
/// <exception cref="InvalidDataException">
/// Thrown when this data source is not configured for BLOB records.
/// </exception>
public virtual void AddRecord(string name, Stream stream, string fields, ConcurrentQueue <IIndexDocument> queue)
{
    IIndexDocument doc = null;

    AddRecordBase(name, fields);

    // The original condition "(== File) || (!= Blob)" reduces to this single test.
    if (indexType != IndexType.Blob)
    {
        throw new InvalidDataException("Adding record of wrong IndexType");
    }

    // Read the head of the stream for type detection. The stream is read
    // directly: wrapping it in a BinaryReader and disposing that reader
    // would close the stream before the document could be parsed.
    byte[] buff = new byte[4000];
    int read = 0;
    int chunk;
    while ((read < buff.Length) && ((chunk = stream.Read(buff, read, buff.Length - read)) > 0))
    {
        read += chunk;
    }

    // Only the bytes actually read take part in detection (no zero padding).
    String det = Encoding.UTF8.GetString(buff, 0, read);
    stream.Seek(0, SeekOrigin.Begin);

    // detect type
    if ((buff[0] == '%') && (buff[1] == 'P') && (buff[2] == 'D') && (buff[3] == 'F'))
    {
        DocumentsDataSource.IndexPDFDocument pdf = new DocumentsDataSource.IndexPDFDocument(name, stream, this);
        if (fields != null)
        {
            pdf.headers = () => { return(fields); };
        }
        doc = pdf;
    }
    else if (det.Contains("<html"))
    {
        IndexPagedTextFile file = WebDataSource.FromHtml(stream, name, Name);
        if (fields != null)
        {
            file.SetHeaders(fields);
        }
        // Previously the HTML document was built but never assigned, so it was never enqueued.
        doc = file;
    }
    else
    {
        // detect charset
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(buff, 0, read);
        detector.DataEnd();

        // When no charset can be detected the record is skipped (doc stays null).
        if (detector.Charset != null)
        {
            Encoding enc = Portable.Text.Encoding.GetEncoding(detector.Charset);
            using (StreamReader sreader = new StreamReader(stream, enc, false))
            {
                doc = new IndexPagedTextFile("", sreader.ReadToEnd(), fields != null ? fields : "");
            }
        }
    }

    if (doc != null)
    {
        Enqueue(queue, doc);
    }
}
/// <summary>
/// Console entry point. Loads vocabularies, builds the index with the data
/// sources given on the command line, then runs an interactive key loop
/// (index / search / info / manage vocabularies / exit).
/// </summary>
/// <param name="args">
/// Recognised arguments: <c>server</c>, <c>-p:&lt;port&gt;</c>,
/// <c>-i:&lt;index path&gt;</c>, <c>-cv:&lt;voc&gt;</c>,
/// <c>-source:&lt;type&gt;,&lt;location&gt;[,extra]</c> with type
/// <c>doc</c>, <c>web</c>, <c>xml</c> or <c>mysql</c>, and
/// <c>-stops:&lt;file&gt;</c>.
/// </param>
static void Main(string[] args)
{
    Console.Write("DOCODO Search Engine\nCopyrigt (c) 2018 Alexey Zakharchenko \n");

    // Port: the last "-p:<n>" wins; the default is kept when the option is
    // absent or not a number (no exception-driven control flow).
    int nPort = 9001;
    string portArg = args.LastOrDefault(a => a.StartsWith("-p:"));
    int parsedPort;
    if ((portArg != null) && Int32.TryParse(portArg.Substring(3), out parsedPort))
    {
        nPort = parsedPort;
    }

    if (args.Contains("server"))
    {
        new DocodoServer(nPort);
    }

    // Load every vocabulary file found in the Dict folder.
    List <Vocab> vocs = new List <Vocab>();
    Console.Write("Loaded vocs: ");
    foreach (string file in Directory.GetFiles("Dict\\", "*.voc"))
    {
        vocs.Add(new Vocab(file));
        Console.Write(file.Substring(file.LastIndexOf("\\") + 1).Split('.')[0] + " ");
    }
    if (vocs.Count == 0)
    {
        Console.Write("No!");
    }
    Console.Write("\n");

    // Vocabularies requested on the command line, like -cv:en
    foreach (string crvoc in (from a in args where a.StartsWith("-cv:") select a.Substring(4)))
    {
        CreateVoc(crvoc);
    }

    // Index location: the last "-i:<path>" wins, current directory otherwise.
    String basepath = ".";
    string baseArg = args.LastOrDefault(a => a.StartsWith("-i:"));
    if (baseArg != null)
    {
        basepath = baseArg.Substring(3);
    }

    ind = new Index(basepath, false, vocs.ToArray <Vocab>());

    foreach (string source in (from a in args where a.StartsWith("-source:") select a.Substring(8)))
    {
        var spl = source.Split(',');

        // Every source type needs at least "<type>,<location>".
        if (spl.Length < 2)
        {
            Console.WriteLine("Invalid -source argument (expected type,location): " + source);
            continue;
        }

        if (spl[0].Equals("doc"))
        {
            ind.AddDataSource(new DocumentsDataSource("doc", spl[1]));
        }
        else if (spl[0].Equals("web"))
        {
            WebDataSource websource = new WebDataSource("web", spl[1], spl.Length > 2 ? spl[2] : "");
            ind.AddDataSource(websource);
        }
        else if (spl[0].Equals("xml"))
        {
            XmlDataSource xmlsource = new XmlDataSource("xml", spl[1]);
            ind.AddDataSource(xmlsource);
        }
        else if (spl[0].Equals("mysql"))
        {
            // spl[1] names a "key=value" settings file describing the database source.
            string connect = null;
            string query = null;
            string fieldName = null;
            string dbBasePath = null;
            try
            {
                foreach (string line in File.ReadAllLines(spl[1]))
                {
                    string[] parts = line.Split('=');
                    // Values are taken from the raw line so they may contain '=' themselves.
                    if (parts[0].Equals("Connect"))
                    {
                        connect = line.Substring(8);
                    }
                    if (parts[0].Equals("Query"))
                    {
                        query = line.Substring(6);
                    }
                    if (parts[0].Equals("BasePath"))
                    {
                        dbBasePath = line.Substring(9);
                    }
                    if (parts[0].Equals("IndexType"))
                    {
                        fieldName = parts[1];
                    }
                }

                if (connect == null)
                {
                    throw new InvalidDataException("No Connect key");
                }
                if (query == null)
                {
                    throw new InvalidDataException("No Query key");
                }
                if (fieldName == null)
                {
                    throw new InvalidDataException("No IndexType key");
                }
                if (dbBasePath == null)
                {
                    throw new InvalidDataException("No BasePath key");
                }

                // NOTE(review): the index type is always File here; the IndexType
                // setting is only passed on as the field name. Confirm this is intended.
                ind.AddDataSource(new MySqlDBDocSource("mysql_" + spl[1], dbBasePath, connect, query, DBDataSourceBase.IndexType.File, fieldName));
            }
            catch (Exception e)
            {
                // Best effort: a broken mysql source is reported and skipped.
                Console.WriteLine("Error adding mysql source: " + e.Message);
            }
        }
    }

    ind.LoadStopWords("Dict\\stop.txt");
    foreach (string sf in (from a in args where a.StartsWith("-stops:") select a.Substring(7)))
    {
        ind.LoadStopWords(sf);
    }

    cancelationToken = ind.cancelationToken; // token to cancel something

    if (ind.CanSearch)
    {
        Console.WriteLine("Index loaded, contains {0} words", ind.Count);
    }

    ConsoleKey c;
    do
    {
        Console.WriteLine("Press " + (ind.CanIndex ? "I to index, " : "") + (ind.CanSearch ? " S to search, O for info, " : "") + "V to manage vocs, E to exit...");
        c = Console.ReadKey(false).Key;

        if (c == ConsoleKey.V)
        {
            while (true)
            {
                Console.WriteLine("-----------\nCreate vocabs\nType voc name from list below or e to exit:");
                foreach (string f in Directory.GetDirectories("Dict\\").Select((s) => s.Substring(s.LastIndexOf('\\') + 1)))
                {
                    Console.Write(f + ",");
                }
                Console.WriteLine("");

                // ReadLine returns null at end of input; treat that like "e".
                string line = Console.ReadLine();
                if ((line == null) || line.Equals("e"))
                {
                    break;
                }
                CreateVoc(line);
            }
        }

        if (c == ConsoleKey.O)
        {
            ShowInfo();
        }
        else if (c == ConsoleKey.S)
        {
            Console.WriteLine("Type text to search, e - exit");
            Console.Write("req:");
            Console.InputEncoding = Encoding.Unicode;
            string req;
            // A null request (no more input) ends the search session as well.
            while (((req = ReadSearchRequest()) != null) && !req.Equals("e"))
            {
                Index.SearchResult result = ind.Search(req);
                Console.WriteLine("Found {0} pages in {1} docs:", result.foundPages.Count, result.foundDocs.Count);
                foreach (var d in result.foundDocs)
                {
                    Console.WriteLine($"Doc: {d.Name}, Found {d.pages.Count} pages");
                    foreach (var p in d.pages)
                    {
                        Console.WriteLine($" Page {p.id} ({p.pos.Count} times)");
                        Console.WriteLine(" Text: " + p.text);
                    }
                }
                Console.Write("req:");
            }
        }
        else if (c == ConsoleKey.I)
        {
            Console.WriteLine($"Start Indexing ...");

            // Console listener that cancels indexing when C is pressed.
            // NOTE: it is deliberately NOT started (as in the original code),
            // so indexing currently cannot be interrupted from the console.
            Task cT = new Task(() =>
            {
                do
                {
                    while (Console.In.Peek() == -1)
                    {
                        Thread.Sleep(200);
                        if (cancelationToken == null)
                        {
                            break;
                        }
                    }
                    if (cancelationToken != null)
                    {
                        if (Console.ReadKey().Key == ConsoleKey.C)
                        {
                            Console.WriteLine("Indexing was interrupted by user.");
                            cancelationToken.Cancel();
                            break;
                        }
                    }
                }while (cancelationToken != null);
            });

            try
            {
                // GetAwaiter().GetResult() rethrows the original exception.
                // Task.Wait() would wrap it in AggregateException, so the
                // cancellation handler below could never match.
                ind.CreateAsync().GetAwaiter().GetResult();
            }
            catch (OperationCanceledException)
            {
                // Cancellation is an expected way to stop indexing.
            }
            catch (Exception e)
            {
                Console.WriteLine("Error creaing index: " + e.Message);
            }

            cancelationToken = null;
            Console.WriteLine("Indexing complited.");
        }
    }while (c != ConsoleKey.E);
}