/// <summary>
/// Builds the lookup by registering every supplied field reader, handing the default
/// text extractor and tokenizer to <c>RegisterField</c> for each of them.
/// </summary>
/// <param name="fieldReaders">The field readers to register.</param>
/// <param name="defaultTextExtractor">Text extractor passed to every registration.</param>
/// <param name="defaultTokenizer">Tokenizer passed to every registration.</param>
/// <exception cref="ArgumentNullException">Any of the three arguments is null.</exception>
internal IndexedFieldLookup(
    IEnumerable<IFieldReader> fieldReaders,
    ITextExtractor defaultTextExtractor,
    ITokenizer defaultTokenizer)
{
    // Validate in declaration order so the first offending argument is the one reported.
    _ = fieldReaders ?? throw new ArgumentNullException(nameof(fieldReaders));
    _ = defaultTextExtractor ?? throw new ArgumentNullException(nameof(defaultTextExtractor));
    _ = defaultTokenizer ?? throw new ArgumentNullException(nameof(defaultTokenizer));

    foreach (var fieldReader in fieldReaders)
    {
        this.RegisterField(fieldReader, defaultTextExtractor, defaultTokenizer);
    }
}
/// <summary>
/// Creates the search service and resolves the index and documents folders, both of which
/// are located under the current user's "My Documents" folder.
/// </summary>
/// <param name="pdfTextExtractor">
/// Text extractor to use; when null, one is requested from <c>Locator.Current</c>.
/// </param>
public LuceneSearchService(ITextExtractor pdfTextExtractor)
{
    // Fall back to the service locator when no extractor was injected.
    _pdfTextExtractor = pdfTextExtractor ?? Locator.Current.GetService<ITextExtractor>();

    // Both working folders share the same parent, so resolve it once.
    string documentsRoot = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
    _indexPath = Path.Combine(documentsRoot, "index");
    _documentsPath = Path.Combine(documentsRoot, "files");
}
/// <summary>
/// Wires up a full text index from its configured collaborators, builds the field lookup
/// from the configured fields and creates the root index node.
/// </summary>
/// <param name="indexOptions">Options stored on the index as supplied.</param>
/// <param name="itemTokenizationOptions">Source of the configured fields; must not be null.</param>
/// <param name="indexNodeFactory">Factory used to create the root node; must not be null.</param>
/// <param name="queryParser">Query parser stored on the index; must not be null.</param>
/// <param name="scorer">Scorer factory handed to the index navigator pool.</param>
/// <param name="defaultTextExtractor">Default text extractor; must not be null.</param>
/// <param name="defaultTokenizer">Default tokenizer; must not be null.</param>
/// <param name="indexModifiedActions">Optional actions stored on the index as supplied.</param>
/// <exception cref="ArgumentNullException">A mandatory dependency is null.</exception>
internal FullTextIndex(
    IndexOptions indexOptions,
    ConfiguredObjectTokenizationOptions<TKey> itemTokenizationOptions,
    IIndexNodeFactory indexNodeFactory,
    IQueryParser queryParser,
    IIndexScorerFactory scorer,
    ITextExtractor defaultTextExtractor,
    ITokenizer defaultTokenizer,
    Func<IIndexSnapshot<TKey>, Task>[]? indexModifiedActions)
{
    this.indexNavigatorPool = new IndexNavigatorPool(scorer);
    this.indexOptions = indexOptions;
    this.itemTokenizationOptions = itemTokenizationOptions ?? throw new ArgumentNullException(nameof(itemTokenizationOptions));
    this.IndexNodeFactory = indexNodeFactory ?? throw new ArgumentNullException(nameof(indexNodeFactory));
    this.queryParser = queryParser ?? throw new ArgumentNullException(nameof(queryParser));

    // Fail fast here, like the other mandatory dependencies, instead of relying on the
    // field lookup constructed below to reject a null extractor.
    this.DefaultTextExtractor = defaultTextExtractor ?? throw new ArgumentNullException(nameof(defaultTextExtractor));
    this.DefaultTokenizer = defaultTokenizer ?? throw new ArgumentNullException(nameof(defaultTokenizer));
    this.indexModifiedActions = indexModifiedActions;
    this.idPool = new IdPool<TKey>();

    this.FieldLookup = new IndexedFieldLookup(
        this.itemTokenizationOptions.GetAllConfiguredFields(),
        defaultTextExtractor,
        defaultTokenizer);

    this.Root = this.IndexNodeFactory.CreateRootNode();
}
/// <summary>
/// Creates the handler from its four collaborators, all of which are mandatory.
/// </summary>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public ImprovedTranslateCommandHandler(
    IEducationProfileDownloader downloader,
    ITextExtractor textExtractor,
    ITranslator translator,
    ITranslationPersister persister)
{
    // Guard clauses first, in parameter order, then plain assignments.
    if (downloader == null)
    {
        throw new ArgumentNullException(nameof(downloader));
    }

    if (textExtractor == null)
    {
        throw new ArgumentNullException(nameof(textExtractor));
    }

    if (translator == null)
    {
        throw new ArgumentNullException(nameof(translator));
    }

    if (persister == null)
    {
        throw new ArgumentNullException(nameof(persister));
    }

    _downloader = downloader;
    _textExtractor = textExtractor;
    _translator = translator;
    _persister = persister;
}
/// <summary>
/// Creates the command and stores the supplied collaborators as given (no validation is performed).
/// </summary>
public UploadImageCommand(
    ContinuityManager continuityManager,
    ITextExtractor textExtractor,
    IBitmapUtility bitmapUtility,
    IImageRatioValidator imageRatioValidator)
{
    this.imageRatioValidator = imageRatioValidator;
    this.bitmapUtility = bitmapUtility;
    this.textExtractor = textExtractor;
    this.continuityManager = continuityManager;
}
/// <summary>
/// Extracts the identifiers of a single file, splits each of them and returns the
/// filtered, non-empty splits.
/// </summary>
/// <param name="file">The file to process; its extension selects the text extractor.</param>
/// <returns>The filtered splits of every identifier that could be split.</returns>
/// <exception cref="NotSupportedException">
/// No text extractor is registered for the file's extension.
/// </exception>
public IEnumerable<string> GetResult(IndexerFile file)
{
    IndexerResult indexerResult = new IndexerResult();
    _configuration.Splitter.SetResultPhase(true);

    // Single dictionary lookup instead of ContainsKey followed by the indexer.
    ITextExtractor textExtractor;
    if (!_textExtractors.TryGetValue(file.Extension, out textExtractor))
    {
        string message = "No extractor is defined for file extension: " + file.Extension + ".";
        throw new NotSupportedException(message);
    }

    string fileText = File.ReadAllText(file.Path);
    foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
    {
        try
        {
            IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
            identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
            indexerResult.AddSplitResult(identifierSplitResult);
        }
        catch (Exception)
        {
            // Deliberate best effort: an identifier that cannot be split is skipped so
            // the remaining identifiers of the file are still processed.
        }
    }

    return indexerResult.GetSplitResultList()
        .SelectMany(x => x.Splits)
        .Select(Filter)
        .Where(x => !string.IsNullOrEmpty(x));
}
/// <summary>
/// Returns the text that <paramref name="textExtractor"/> reads from the file at
/// <paramref name="filename"/>.
/// </summary>
private static string ParseFile(ITextExtractor textExtractor, string filename)
{
    // Reference links collected while testing (presumably IFilter packs per file type - verify before relying on them).
    //
    // Images:
    //   https://www.google.se/search?q=tiff+ifilter&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:official&client=firefox-a&channel=fflb#hl=sv&client=firefox-a&hs=Cis&tbo=d&rls=org.mozilla:en-US%3Aofficial&channel=fflb&sclient=psy-ab&q=image+ifilter&oq=image+ifilter&gs_l=serp.3..0i13l4.4693.4693.1.4936.1.1.0.0.0.0.55.55.1.1.0...0.0...1c.1.k9PU0a3S1m4&pbx=1&bav=on.2,or.r_gc.r_pw.r_qf.&bvm=bv.1355534169,d.bGE&fp=71792659602ba5ba&bpcl=40096503&biw=1680&bih=919
    //   http://technet.microsoft.com/sv-se/library/dd834685.aspx
    //   http://technet.microsoft.com/en-us/library/dd744701%28v=ws.10%29.aspx
    //
    // test.pdf:
    //   x64, working: http://www.adobe.com/support/downloads/thankyou.jsp?ftpID=4025&fileID=3941
    //   http://www.adobe.com/support/downloads/detail.jsp?ftpID=2611
    //   http://www.foxitsoftware.com/products/ifilter/
    //   http://www.microsoft.com/en-us/download/details.aspx?id=3988
    //
    // test.docx:
    //   http://www.microsoft.com/en-us/download/details.aspx?id=20109
    //   http://www.microsoft.com/en-us/download/details.aspx?id=17062
    //
    // A stream-based variant (File.OpenRead + textExtractor.GetText(stream), writing the
    // exception to the console and returning "" on failure) was tried earlier and is disabled.
    var parsedText = textExtractor.GetText(filename);
    return parsedText;
}
/// <summary>
/// Creates the search service and stores its collaborators as supplied.
/// </summary>
/// <param name="userDao">User data access object.</param>
/// <param name="cuyahogaContextProvider">Provider of the Cuyahoga context.</param>
/// <param name="textExtractor">Text extractor kept for later use.</param>
/// <param name="contentItemService">Content item service.</param>
public SearchService(
    IUserDao userDao,
    ICuyahogaContextProvider cuyahogaContextProvider,
    ITextExtractor textExtractor,
    IContentItemService<IContentItem> contentItemService)
{
    _userDao = userDao;
    _cuyahogaContextProvider = cuyahogaContextProvider;
    _textExtractor = textExtractor;
    _contentItemService = contentItemService;
}
/// <summary>
/// Initializes the search service with the collaborators it depends on. The arguments
/// are stored without validation.
/// </summary>
public SearchService(IUserDao userDao,
                     ICuyahogaContextProvider cuyahogaContextProvider,
                     ITextExtractor textExtractor,
                     IContentItemService<IContentItem> contentItemService)
{
    this._contentItemService = contentItemService;
    this._textExtractor = textExtractor;
    this._cuyahogaContextProvider = cuyahogaContextProvider;
    this._userDao = userDao;
}
/// <summary>
/// Builds the Lucene document that represents a content item in the search index.
/// </summary>
/// <param name="contentItem">The content item; it must implement ISearchableContent.</param>
/// <param name="textExtractor">Passed to the item's ToSearchContent call.</param>
/// <returns>The document with the standard, category, role and custom fields added.</returns>
/// <exception cref="ArgumentException">The content item does not implement ISearchableContent.</exception>
private Document BuildDocumentFromContentItem(IContentItem contentItem, ITextExtractor textExtractor)
{
    ISearchableContent searchable = contentItem as ISearchableContent;
    if (searchable == null)
    {
        throw new ArgumentException("Argument must implement ISearchableContent");
    }

    // Text to index, with (x)html tags stripped.
    string rawContent = searchable.ToSearchContent(textExtractor);
    string plainText = System.Text.RegularExpressions.Regex.Replace(rawContent, @"<(.|\n)*?>", string.Empty);

    // The actual url of the item.
    string url = contentItem.GetContentUrl();

    // Fall back to the first part of the content when the item has no summary.
    string summary = contentItem.Summary ?? Text.TruncateText(plainText, 200);

    // Short names for the storage/indexing options used below.
    Field.Store stored = Field.Store.YES;
    Field.Index analyzed = Field.Index.TOKENIZED;
    Field.Index verbatim = Field.Index.UN_TOKENIZED;

    Document document = new Document();
    document.Add(new Field("globalid", contentItem.GlobalId.ToString("N"), stored, verbatim));
    document.Add(new Field("title", contentItem.Title, stored, analyzed));
    document.Add(new Field("summary", summary, stored, analyzed));
    document.Add(new Field("contents", plainText, Field.Store.NO, analyzed));
    document.Add(new Field("author", contentItem.CreatedBy.FullName, stored, analyzed));
    document.Add(new Field("moduletype", contentItem.Section.ModuleType.Name, stored, verbatim));
    document.Add(new Field("path", url, stored, verbatim));
    document.Add(new Field("site", contentItem.Section.Node.Site.Id.ToString(), stored, verbatim));
    document.Add(new Field("datecreated", contentItem.CreatedAt.ToString("u"), stored, verbatim));
    document.Add(new Field("datemodified", contentItem.ModifiedAt.ToString("u"), stored, verbatim));
    if (contentItem.PublishedAt.HasValue)
    {
        document.Add(new Field("datepublished", contentItem.PublishedAt.Value.ToString("u"), stored, verbatim));
    }

    // do not index the sectionid here (since it's used for access filtering)
    document.Add(new Field("sectionid", contentItem.Section.Id.ToString(), stored, Field.Index.NO));

    foreach (Category category in contentItem.Categories)
    {
        document.Add(new Field("category", category.Name, stored, verbatim));
    }

    foreach (Role role in contentItem.ViewRoles)
    {
        document.Add(new Field("viewroleid", role.Id.ToString(), stored, verbatim));
    }

    foreach (CustomSearchField custom in searchable.GetCustomSearchFields())
    {
        Field.Store customStore = custom.IsStored ? Field.Store.YES : Field.Store.NO;
        Field.Index customIndex = custom.IsTokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED;

        // Custom fields without a key or a value are skipped.
        if (custom.FieldKey != null && custom.FieldValue != null)
        {
            document.Add(new Field(custom.FieldKey, custom.FieldValue, customStore, customIndex));
        }
    }

    return document;
}
/// <summary>
/// Creates the file resource service and stores its collaborators as supplied.
/// </summary>
/// <param name="fileService">File service.</param>
/// <param name="cuyahogaContextProvider">Provider of the Cuyahoga context.</param>
/// <param name="contentItemService">Content item service for file resources.</param>
/// <param name="commonDao">Common data access object.</param>
/// <param name="textExtractor">Text extractor kept for later use.</param>
public FileResourceService(
    IFileService fileService,
    ICuyahogaContextProvider cuyahogaContextProvider,
    IContentItemService<FileResource> contentItemService,
    ICommonDao commonDao,
    ITextExtractor textExtractor)
{
    // Assigned in parameter order.
    _fileService = fileService;
    _cuyahogaContextProvider = cuyahogaContextProvider;
    _contentItemService = contentItemService;
    _commonDao = commonDao;
    _textExtractor = textExtractor;
}
/// <summary>
/// Initializes the file resource service with the collaborators it depends on. The
/// arguments are stored without validation.
/// </summary>
public FileResourceService(IFileService fileService,
                           ICuyahogaContextProvider cuyahogaContextProvider,
                           IContentItemService<FileResource> contentItemService,
                           ICommonDao commonDao,
                           ITextExtractor textExtractor)
{
    this._textExtractor = textExtractor;
    this._cuyahogaContextProvider = cuyahogaContextProvider;
    this._contentItemService = contentItemService;
    this._fileService = fileService;
    this._commonDao = commonDao;
}
/// <summary>
/// Creates an index builder over the index files in <paramref name="physicalIndexDir"/>
/// and initializes the index writer.
/// </summary>
/// <param name="physicalIndexDir">Location of the index files.</param>
/// <param name="rebuildIndex">Flag to indicate if the index should be rebuilt.</param>
/// <param name="textExtractor">The text extractor that can be used to extract text from content.</param>
public IndexBuilder(string physicalIndexDir, bool rebuildIndex, ITextExtractor textExtractor)
{
    _textExtractor = textExtractor;
    _rebuildIndex = rebuildIndex;
    _indexDirectory = FSDirectory.GetDirectory(physicalIndexDir, false);

    // All fields are assigned before the writer is initialized.
    InitIndexWriter();
    log.Info("IndexBuilder created.");
}
/// <summary>
/// Verifies that every paragraph returned by the extractor is passed to the translator
/// together with the command's target language.
/// </summary>
public async Task HandleAsync_uses_translator_to_translate_text(
    [Frozen] ITextExtractor extractor,
    [Frozen] ITranslator translator,
    ImprovedTranslateCommandHandler sut,
    IDispatcher dispatcher,
    ICommandContext<TranslateEducationCommand> context,
    string[] paragraphs)
{
    // Arrange: the extractor yields the generated paragraphs for any input.
    var extractorMock = Mock.Get(extractor);
    extractorMock.Setup(p => p.ExtractText(It.IsAny<string>())).Returns(paragraphs);

    // Act
    await sut.HandleAsync(dispatcher, context);

    // Assert: each paragraph was translated.
    var translatorMock = Mock.Get(translator);
    foreach (var paragraph in paragraphs)
    {
        translatorMock.Verify(p => p.TranslateText(paragraph, context.Command.ToLanguage));
    }
}
/// <summary>
/// Verifies that the content returned by the downloader is passed to the text extractor.
/// </summary>
public async Task HandleAsync_uses_extractor_to_extract_paragraphs(
    [Frozen] IEducationProfileDownloader downloader,
    [Frozen] ITextExtractor extractor,
    ImprovedTranslateCommandHandler sut,
    IDispatcher dispatcher,
    ICommandContext<TranslateEducationCommand> context,
    string content)
{
    // Arrange: any profile id resolves to the generated content.
    var downloaderMock = Mock.Get(downloader);
    downloaderMock.Setup(p => p.GetProfile(It.IsAny<int>())).ReturnsAsync(content);

    // Act
    await sut.HandleAsync(dispatcher, context);

    // Assert
    var extractorMock = Mock.Get(extractor);
    extractorMock.Verify(p => p.ExtractText(content));
}
/// <summary>
/// Creates the indexed representation of an uploaded document: file name without
/// extension, creation time, owner, extracted content and word frequency.
/// </summary>
/// <param name="uploadedDocument">The uploaded document; its extension selects the text extractor.</param>
/// <param name="user">The user whose id is stored on the document.</param>
/// <param name="documentIndexator">Used to compute the word frequency.</param>
public IndexedDocument(DocumentModel uploadedDocument, User user, DocumentIndexator documentIndexator)
{
    FileName = Path.GetFileNameWithoutExtension(uploadedDocument.File.FileName);
    CreatedDate = DateTime.Now;
    UserId = user.Id;

    // The extractor is chosen by the document's extension.
    Content = TextExtractorFactory
        .GetTextExtractor(uploadedDocument.DocumentExtension)
        .ExtractText(uploadedDocument);

    // The file name takes part in the word frequency alongside the content.
    var indexedText = Content + ' ' + FileName;
    WordFrequency = documentIndexator.ExtractWordFrequency(indexedText);
}
/// <summary>
/// Feeds every identifier found in the files to scan into the splitter's token dictionary.
/// </summary>
private void UpdateTokenDictionary()
{
    _configuration.Splitter.SetResultPhase(false);

    // First pass over the source files: build the token dictionary from all extracted text.
    int total = _configuration.FilesToScan.Count;
    int processed = 0;
    foreach (IndexerFile file in _configuration.FilesToScan)
    {
        try
        {
            _configuration.NotificationHandler.UpdateStatus(
                NotificationType.AnalyzingFile, processed, total, "Extracting file: " + file.Name);

            // The question is only asked when no extractor is registered for the extension.
            // NOTE: if the user answers "no", execution falls through to the indexer below.
            if (!_textExtractors.ContainsKey(file.Extension)
                && _configuration.NotificationHandler.GetYesNoAnswer(
                    QuestionType.NoTextExtratorDefined,
                    "No extractor is defined for file extension: " + file.Extension + ". Do you want to skip this file?"))
            {
                continue;
            }

            ITextExtractor extractor = _textExtractors[file.Extension];
            string contents = File.ReadAllText(file.Path);
            foreach (string token in extractor.Extract(contents))
            {
                _configuration.NotificationHandler.UpdateStatus(
                    NotificationType.IdentifyingToken, processed, total,
                    "Analyzing token: " + token + " in file: " + file.Name);
                _configuration.Splitter.UpdateTokenDictionary(token);
            }
        }
        catch (Exception e)
        {
            string question = "Error reading file. " + Environment.NewLine
                + "File: " + file.Name + Environment.NewLine
                + "Message: " + e.Message + Environment.NewLine
                + "Do you want to skip this file?";

            // Rethrow unless the user chooses to skip the failing file.
            bool skip = _configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, question);
            if (!skip)
            {
                throw;
            }
        }
        finally
        {
            // Counted for skipped and failed files as well.
            processed++;
        }
    }
}
/// <summary>
/// Creates the roll-call form for a group: sets up its state, builds the UI, loads and
/// paints the students and finally starts the text extractor.
/// </summary>
/// <param name="grupo">The group stored on the form.</param>
public PaseLista(Grupo grupo)
{
    // Collaborators
    this.grupo = grupo;
    textExtractor = new DefaultTextExtractor(1);
    repositorioAlumno = new RepositorioAlumno();
    repositorioSesiones = new RepositorioSesiones();

    // UI bookkeeping
    tableLabels = new List<Label>();
    tableButtons = new List<Button>();
    labelMap = new Dictionary<string, Label>();
    nameTokensMap = new Dictionary<string, string[]>();
    labelWidth = 780;
    labelHeight = 24;

    // Attendance state
    alumnosPresentes = new Dictionary<string, Alumno>();
    alumnosAusentes = new Dictionary<string, Alumno>();

    InitializeComponent();
    CargarAlumnos();
    PintarAlumnos();

    textExtractor.StartWorking();
}
/// <summary>
/// Parses every file the extractor reports as parseable, prints each result to the
/// console and finally prints the total elapsed time.
/// </summary>
private static void ParseFiles(ITextExtractor textExtractor, IEnumerable<FileInfo> files)
{
    var candidates = files.Where(f => textExtractor.IsParseable(f.FullName)).ToArray();

    // Only the parsing itself is timed, not the filtering above.
    var stopwatch = Stopwatch.StartNew();
    foreach (var candidate in candidates)
    {
        Console.WriteLine("{0}{0}---------------------------------------", Environment.NewLine);
        Console.WriteLine("Parsing file: {0}{1}", candidate.Name, Environment.NewLine);

        var text = ParseFile(textExtractor, candidate.FullName);

        Console.WriteLine("Parsed result is: ");
        Console.Write(text);
    }
    stopwatch.Stop();

    Console.WriteLine("{0}{0}Processed in {1}", Environment.NewLine, stopwatch.Elapsed);
}
/// <summary>
/// Registers a field under the next free id, using the field's own tokenizer and text
/// extractor when it has them and the supplied defaults otherwise.
/// </summary>
/// <exception cref="LiftiException">
/// The field name is already registered, or the new id would exceed <see cref="byte.MaxValue"/>.
/// </exception>
private void RegisterField(IFieldReader fieldOptions, ITextExtractor defaultTextExtractor, ITokenizer defaultTokenizer)
{
    var name = fieldOptions.Name;
    if (this.fieldToDetailsLookup.ContainsKey(name))
    {
        throw new LiftiException(ExceptionMessages.FieldNameAlreadyUsed, name);
    }

    // Ids are handed out from a shared counter and must fit in a byte.
    var allocatedId = Interlocked.Increment(ref nextId);
    if (allocatedId > byte.MaxValue)
    {
        throw new LiftiException(ExceptionMessages.MaximumDistinctFieldsIndexReached);
    }

    var fieldId = (byte)allocatedId;
    var tokenizer = fieldOptions.Tokenizer ?? defaultTokenizer;
    var extractor = fieldOptions.TextExtractor ?? defaultTextExtractor;
    var details = new IndexedFieldDetails(fieldId, extractor, tokenizer);

    // Keep both directions of the mapping in step.
    this.fieldToDetailsLookup[name] = details;
    this.idToFieldLookup[fieldId] = name;
}
/// <summary>
/// Creates a list backed by a new, empty ArrayList that uses the default delimiter
/// and the supplied text extractor.
/// </summary>
/// <param name="textExtractor">The text extractor forwarded to the main constructor.</param>
public JoinableList(ITextExtractor textExtractor) : this(new ArrayList(), m_DefaultDelimiter, textExtractor) { }
/// <summary>
/// Creates a list backed by a new, empty ArrayList that uses the supplied delimiter
/// and text extractor.
/// </summary>
/// <param name="delimiter">The delimiter forwarded to the main constructor.</param>
/// <param name="textExtractor">The text extractor forwarded to the main constructor.</param>
public JoinableList(string delimiter, ITextExtractor textExtractor) : this(new ArrayList(), delimiter, textExtractor) { }
/// <summary>
/// Builds the indexer result for all files to scan: extracts and splits the identifiers
/// of every file, updates the result from the token dictionary, optionally replaces
/// identified words with their stemmed form, and finally filters the result.
/// </summary>
/// <returns>Indexer Result</returns>
private IndexerResult GetResult()
{
    IndexerResult indexerResult = new IndexerResult();
    _configuration.Splitter.SetResultPhase(true);

    // extract
    int totalFileCount = _configuration.FilesToScan.Count;
    int currentFileCount = 0;
    foreach (IndexerFile file in _configuration.FilesToScan)
    {
        try
        {
            _configuration.NotificationHandler.UpdateStatus(NotificationType.ReadingFileForIdentifiers, currentFileCount, totalFileCount, "Extracting file identifier: " + file.Name);

            // When no extractor is registered, the user decides whether to skip the file.
            // NOTE(review): answering "no" falls through to the indexer lookup below.
            if (!_textExtractors.ContainsKey(file.Extension))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ". Do you want to skip this file?";
                if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                {
                    continue;
                }
            }

            ITextExtractor textExtractor = _textExtractors[file.Extension];
            string fileText = File.ReadAllText(file.Path);
            foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
            {
                _configuration.NotificationHandler.UpdateStatus(NotificationType.Splitting, currentFileCount, totalFileCount, "Splitting token: " + identifier + " in file: " + file.Name);
                IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
                identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
                indexerResult.AddSplitResult(identifierSplitResult);
            }
        }
        catch (Exception e)
        {
            // Any failure while processing a file is reported; the exception is rethrown
            // unless the user chooses to skip the file.
            string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
            if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
            {
                throw;
            }
        }
        finally
        {
            // Runs for processed, skipped and failed files alike.
            currentFileCount++;
        }
    }

    // Since while adding result we do not have merged token, misspelled and stemmed word info, filter them and add to respective list
    indexerResult.UpdateFromMergeToken(_tokenDictionary);
    indexerResult.UpdateFromMisspelled(_tokenDictionary);
    indexerResult.UpdateFromStemmed(_tokenDictionary);

    // Filter 3: Stem every identified. If the word is identified replace the word with stemmed word
    if (_configuration.Stemmer != null)
    {
        // The keys are copied into a list before the loop, which calls
        // AddStemmedWordAndReplaceIdentified on the same result object.
        List<string> dictionaryWordList = indexerResult.GetDictionaryWordList().Keys.ToList();
        int totalIdentifiedCount = dictionaryWordList.Count;
        int currentIdentifiedCount = 0;
        foreach (string identified in dictionaryWordList)
        {
            currentIdentifiedCount++;
            _configuration.NotificationHandler.UpdateStatus(NotificationType.Stemming, currentIdentifiedCount, totalIdentifiedCount, "Stemming: " + identified);
            string stemmedText = _configuration.Stemmer.GetStemmedText(identified);

            // Replace only when stemming changed the word and the stem is a dictionary word.
            if (stemmedText != null && stemmedText != identified && _configuration.Dictionary.IsWord(stemmedText))
            {
                indexerResult.AddStemmedWordAndReplaceIdentified(identified, stemmedText);
            }
        }
    }

    // Filter result
    indexerResult.RemoveFilterWordAndTokenResult(_configuration.Dictionary);
    _configuration.NotificationHandler.UpdateStatus(NotificationType.IndexingCompleted, 1, 1, "Indexing Completed");
    return(indexerResult);
}
/// <summary>
/// Creates the index service and stores its collaborators as supplied.
/// </summary>
/// <param name="luceneIndexService">Lucene index service for indexed documents.</param>
/// <param name="textExtractor">Text extractor kept for later use.</param>
/// <param name="documentsRepository">Repository of documents.</param>
public DssIndexService(
    ILuceneIndexService<IndexedDocument> luceneIndexService,
    ITextExtractor textExtractor,
    IRepository<Document> documentsRepository)
{
    this._documentsRepository = documentsRepository;
    this._textExtractor = textExtractor;
    this._luceneIndexService = luceneIndexService;
}
/// <summary>
/// Gets the text of this content item that should be indexed.
/// </summary>
/// <param name="textExtractor">Not used by this implementation.</param>
/// <returns>The stored content of the item.</returns>
public virtual string ToSearchContent(ITextExtractor textExtractor)
{
    string searchContent = this._content;
    return searchContent;
}
// Click handler that builds one data tree per document found in a user-selected file.
// It validates the form, asks for the source file, reads the documents with the
// extractor matching the selected format and saves a .dtree file for each document.
private void formatOkay_Click(object sender, RoutedEventArgs e)
{
    // A base content tree is required.
    if (baseTree == null)
    {
        MessageBox.Show("Please select a content tree for the data tree.");
        return;
    }

    // Missing format selection or document format text: flag the control in red and stop.
    if (formatBox.SelectedIndex == -1)
    {
        formatBox.BorderBrush = Brushes.Red;
        return;
    }
    if (string.IsNullOrEmpty(documentFormatBox.Text))
    {
        documentFormatBox.BorderBrush = Brushes.Red;
        return;
    }

    OpenFileDialog ofd = new OpenFileDialog();
    ofd.FileName = "Tree";
    ofd.DefaultExt = ".txt";
    Nullable<bool> result = ofd.ShowDialog();
    if (result == true)
    {
        string filename = ofd.FileName;
        // NOTE(review): "datatrees" is appended directly to the selected file path here
        // and in the save path below - confirm this is the intended output location.
        documentLabel.Content = filename + "datatrees";
        using (Ookii.Dialogs.Wpf.ProgressDialog dial = new ProgressDialog())
        {
            dial.ProgressBarStyle = ProgressBarStyle.MarqueeProgressBar;
            dial.Show();
            dial.Description = "Analyzing text...";
            IIO io = new FileIO();
            ITextExtractor it = null;

            // Selected format index 0 uses the XML extractor, index 1 the begin-marker extraction.
            switch (formatBox.SelectedIndex)
            {
                case 0:
                    string text = io.ReadSource(filename);
                    it = new XMLTextExtractor(text, documentFormatBox.Text);
                    break;
                case 1:
                    var texts = io.ReadSourceIterable(filename);
                    it = new BeginMarkerExtraction(texts, documentFormatBox.Text);
                    break;
                default:
                    throw new InvalidOperationException();
            }

            documents = new ObservableCollection<string>();

            // One data tree per extracted document, saved under the document's name.
            while (it.HasNextContent())
            {
                string content = it.FindNextContent();
                string name = Helpers.GetNameWhenFirst(content);
                documents.Add(name);
                IDataTree tree = DataTreeBuilder.CreateDocumentMappedTree(baseTree);
                DataTreeBuilder.AddToDataTree(tree, content);
                ITreeIO tio = new TreeIO();
                tio.SaveDataTree(tree, filename + @"datatrees\" + name + ".dtree");
            }
            documentList.ItemsSource = documents;
        }
    }

    // The popup is closed whether or not a file was chosen.
    buildDataTreePopup.IsOpen = false;
}
/// <summary>
/// Extracts the text of a file from <paramref name="stream"/>, choosing the extractor
/// by the extension of <paramref name="fileName"/>.
/// </summary>
/// <param name="stream">The file content. A null or empty stream yields an empty result without an error.</param>
/// <param name="fileName">The file name whose extension selects the extractor.</param>
/// <param name="errorMessage">
/// Null on success; otherwise why no text was returned. When extraction throws, the
/// exception's message is reported rather than the generic "Couldn't extract text" message.
/// </param>
/// <returns>The extracted text with NUL characters replaced by '.', or an empty string.</returns>
public static string GetExtract(Stream stream, string fileName, out string errorMessage)
{
    errorMessage = null;

    // Nothing to extract: not an error.
    if (stream == null || stream.Length == 0)
    {
        return String.Empty;
    }

    if (String.IsNullOrEmpty(fileName))
    {
        errorMessage = "Cannot resolve a TextExtractor if FileName is null or empty";
        return String.Empty;
    }

    var extension = Path.GetExtension(fileName);
    if (String.IsNullOrEmpty(extension))
    {
        errorMessage = "Cannot resolve a TextExtractor if FileName's extension is null or empty";
        return String.Empty;
    }

    extension = extension.TrimStart('.');
    if (extension.Length == 0)
    {
        errorMessage = "Cannot resolve a TextExtractor if FileName's extension is empty";
        return String.Empty;
    }

    // Extensions are identifiers, not linguistic text: lower-case them culture-independently.
    extension = extension.ToLowerInvariant();

    ITextExtractor extractor;
    switch (extension)
    {
        case "txt":
            // Plain text is read straight from the stream; no extractor is involved.
            return SenseNet.ContentRepository.Tools.GetStreamString(stream);
        case "contenttype":
        case "xml":
            extractor = new XmlTextExtractor();
            break;
        case "doc":
            extractor = new DocTextExtractor();
            break;
        case "xls":
            extractor = new XlsTextExtractor();
            break;
        case "pdf":
            extractor = new PdfTextExtractor();
            break;
        case "docx":
            extractor = new DocxTextExtractor();
            break;
        case "xlsx":
            extractor = new XlsxTextExtractor();
            break;
        case "pptx":
            extractor = new PptxTextExtractor();
            break;
        default:
            errorMessage = String.Format("Cannot resolve a TextExtractor for this extension: '{0}'", extension);
            return String.Empty;
    }

    // Extraction runs synchronously. A time-boxed (5 second) asynchronous variant was
    // sketched here earlier and is not in use.
    string result = null;
    try
    {
        result = extractor.Extract(stream);
    }
    catch (Exception e)
    {
        errorMessage = String.Format("An error occured during extracting text. Path: {0}. Message: {1}", fileName, e.Message);
    }

    if (String.IsNullOrEmpty(result))
    {
        // Keep the specific failure reported by the catch block; use the generic
        // message only when the extractor simply produced nothing.
        if (errorMessage == null)
        {
            var format = @"Couldn't extract text. FileName: '{0}' ";
            errorMessage = String.Format(CultureInfo.InvariantCulture, format, fileName);
        }

        // Covers a null result as well, which previously caused a NullReferenceException.
        return String.Empty;
    }

    return result.Replace('\0', '.');
}
/// <summary>
/// Converts a content item into the Lucene document stored in the search index.
/// </summary>
/// <param name="contentItem">The item to index; it has to implement ISearchableContent.</param>
/// <param name="textExtractor">Handed to the item when its search content is requested.</param>
/// <returns>The populated document.</returns>
/// <exception cref="ArgumentException">The item does not implement ISearchableContent.</exception>
private Document BuildDocumentFromContentItem(IContentItem contentItem, ITextExtractor textExtractor)
{
    ISearchableContent searchableItem = contentItem as ISearchableContent;
    if (searchableItem == null)
    {
        throw new ArgumentException("Argument must implement ISearchableContent");
    }

    // Text of the content item, without (x)html tags
    string indexableText = System.Text.RegularExpressions.Regex.Replace(
        searchableItem.ToSearchContent(textExtractor), @"<(.|\n)*?>", string.Empty);

    // Actual url of the content item
    string contentUrl = contentItem.GetContentUrl();

    // A missing summary is replaced by the first part of the text
    string summaryText = contentItem.Summary;
    if (summaryText == null)
    {
        summaryText = Text.TruncateText(indexableText, 200);
    }

    Document luceneDocument = new Document();

    // Standard fields
    luceneDocument.Add(new Field("globalid", contentItem.GlobalId.ToString("N"), Field.Store.YES, Field.Index.UN_TOKENIZED));
    luceneDocument.Add(new Field("title", contentItem.Title, Field.Store.YES, Field.Index.TOKENIZED));
    luceneDocument.Add(new Field("summary", summaryText, Field.Store.YES, Field.Index.TOKENIZED));
    luceneDocument.Add(new Field("contents", indexableText, Field.Store.NO, Field.Index.TOKENIZED));
    luceneDocument.Add(new Field("author", contentItem.CreatedBy.FullName, Field.Store.YES, Field.Index.TOKENIZED));
    luceneDocument.Add(new Field("moduletype", contentItem.Section.ModuleType.Name, Field.Store.YES, Field.Index.UN_TOKENIZED));
    luceneDocument.Add(new Field("path", contentUrl, Field.Store.YES, Field.Index.UN_TOKENIZED));
    luceneDocument.Add(new Field("site", contentItem.Section.Node.Site.Id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));

    // Dates
    luceneDocument.Add(new Field("datecreated", contentItem.CreatedAt.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));
    luceneDocument.Add(new Field("datemodified", contentItem.ModifiedAt.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));
    if (contentItem.PublishedAt.HasValue)
    {
        luceneDocument.Add(new Field("datepublished", contentItem.PublishedAt.Value.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));
    }

    // do not index the sectionid here (since it's used for access filtering)
    luceneDocument.Add(new Field("sectionid", contentItem.Section.Id.ToString(), Field.Store.YES, Field.Index.NO));

    foreach (Category category in contentItem.Categories)
    {
        luceneDocument.Add(new Field("category", category.Name, Field.Store.YES, Field.Index.UN_TOKENIZED));
    }

    foreach (Role role in contentItem.ViewRoles)
    {
        luceneDocument.Add(new Field("viewroleid", role.Id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
    }

    // Custom fields decide for themselves whether they are stored and tokenized
    foreach (CustomSearchField customField in searchableItem.GetCustomSearchFields())
    {
        Field.Store storeOption;
        if (customField.IsStored)
        {
            storeOption = Field.Store.YES;
        }
        else
        {
            storeOption = Field.Store.NO;
        }

        Field.Index indexOption;
        if (customField.IsTokenized)
        {
            indexOption = Field.Index.TOKENIZED;
        }
        else
        {
            indexOption = Field.Index.UN_TOKENIZED;
        }

        // Incomplete custom fields are left out
        if (customField.FieldKey == null || customField.FieldValue == null)
        {
            continue;
        }

        luceneDocument.Add(new Field(customField.FieldKey, customField.FieldValue, storeOption, indexOption));
    }

    return luceneDocument;
}
/// <summary>
/// Gets the text to index for this item by extracting it from the item's physical file.
/// </summary>
/// <param name="textExtractor">The extractor that reads the text from the file.</param>
/// <returns>The text returned by the extractor.</returns>
public virtual string ToSearchContent(ITextExtractor textExtractor)
{
    string physicalFilePath = this._physicalFilePath;
    return textExtractor.ExtractTextFromFile(physicalFilePath);
}
/// <summary>
/// Returns the content of this item as the text to index.
/// </summary>
/// <param name="textExtractor">Not used by this implementation.</param>
/// <returns>The stored content.</returns>
public virtual string ToSearchContent(ITextExtractor textExtractor)
{
    return _content;
}
/// <summary>
/// Sets the text extractor the builder will use as the default when indexing text.
/// A text extractor is useful when the source text contains markup, e.g. the
/// <see cref="XmlTextExtractor"/> for XML/HTML.
/// </summary>
/// <param name="textExtractor">The text extractor to use.</param>
/// <returns>This builder, so further calls can be chained.</returns>
public FullTextIndexBuilder<TKey> WithTextExtractor(ITextExtractor textExtractor)
{
    var builder = this;
    builder.defaultTextExtractor = textExtractor;
    return builder;
}
/// <summary>
/// Creates the file actions with the text extractor they will use.
/// </summary>
/// <param name="textExtractor">The text extractor, stored as supplied.</param>
public FileActions(ITextExtractor textExtractor)
{
    this._textExtractor = textExtractor;
}
/// <summary>
/// Produces the text to index for this item by running the supplied extractor over
/// the item's physical file.
/// </summary>
/// <param name="textExtractor">The extractor used to read the file.</param>
/// <returns>The extracted text.</returns>
public virtual string ToSearchContent(ITextExtractor textExtractor)
{
    string extractedText = textExtractor.ExtractTextFromFile(_physicalFilePath);
    return extractedText;
}
/// <summary>
/// Creates a list over the supplied list that uses the default delimiter and the
/// supplied text extractor.
/// </summary>
/// <param name="list">The list forwarded to the main constructor.</param>
/// <param name="textExtractor">The text extractor forwarded to the main constructor.</param>
public JoinableList(IList list, ITextExtractor textExtractor) : this(list, m_DefaultDelimiter, textExtractor) { }
/// <summary>
/// Extracts the text of a binary, choosing the extractor by the binary's file extension.
/// Extraction runs as a time-boxed activity limited by Repository.TextExtractTimeout
/// (multiplied by 1000 before being passed on).
/// </summary>
/// <param name="binaryData">The binary to read. Null, a missing file name or extension, an unsupported extension, or a null or empty stream all yield an empty result.</param>
/// <param name="node">The node whose version and path are written to the log messages.</param>
/// <returns>The extracted text with NUL characters replaced by '.', or an empty string.</returns>
public static string GetExtract(BinaryData binaryData, Node node)
{
    if (binaryData == null)
    {
        return string.Empty;
    }

    var fname = binaryData.FileName;
    if (fname == null)
    {
        return string.Empty;
    }

    var ext = fname.Extension;
    if (String.IsNullOrEmpty(ext))
    {
        return string.Empty;
    }

    // Extensions are identifiers, not linguistic text: lower-case them culture-independently.
    ITextExtractor extractor;
    switch (ext.ToLowerInvariant())
    {
        case "contenttype":
        case "xml":
            extractor = new XmlTextExtractor();
            break;
        case "doc":
            extractor = new DocTextExtractor();
            break;
        case "xls":
            extractor = new XlsTextExtractor();
            break;
        case "pdf":
            extractor = new PdfTextExtractor();
            break;
        case "docx":
            extractor = new DocxTextExtractor();
            break;
        case "xlsx":
            extractor = new XlsxTextExtractor();
            break;
        case "pptx":
            extractor = new PptxTextExtractor();
            break;
        case "txt":
            extractor = new PlainTextExtractor();
            break;
        default:
            return String.Empty;
    }

    var stream = binaryData.GetStream();
    if (stream == null || stream.Length == 0)
    {
        return String.Empty;
    }

    string result = null;
    try
    {
        // The extractor is called inside a time-boxed activity so it can be aborted
        // when it does not finish in time.
        Action<TimeboxedActivity> timeboxedFunctionCall = activity =>
        {
            var x = (Stream)activity.InArgument;
            var extract = extractor.Extract(x);
            activity.OutArgument = extract;
        };

        var act = new TimeboxedActivity();
        act.InArgument = stream;
        act.Activity = timeboxedFunctionCall;
        act.Context = HttpContext.Current;

        var finishedWithinTime = act.ExecuteAndWait(Repository.TextExtractTimeout * 1000);
        if (!finishedWithinTime)
        {
            act.Abort();
            var msg = String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path);
            Logger.WriteWarning(msg);
            return String.Empty;
        }
        else if (act.ExecutionException != null)
        {
            WriteError(act.ExecutionException, node);
        }
        else
        {
            result = (string)act.OutArgument;
        }
    }
    catch (Exception e)
    {
        WriteError(e, node);
    }

    if (String.IsNullOrEmpty(result))
    {
        var format = @"Couldn't extract text. VersionId: {0}, path: '{1}' ";
        var inf = String.Format(CultureInfo.InvariantCulture, format, node.VersionId, node.Path);
        Logger.WriteWarning(inf);

        // Covers a null OutArgument as well, which previously caused a NullReferenceException.
        return String.Empty;
    }

    return result.Replace('\0', '.');
}
/// <summary>
/// Main constructor: wraps the supplied list and stores the delimiter and text extractor.
/// </summary>
/// <param name="list">The list passed to the base constructor.</param>
/// <param name="delimiter">The delimiter stored on the list.</param>
/// <param name="textExtractor">The text extractor stored on the list.</param>
public JoinableList(IList list, string delimiter, ITextExtractor textExtractor)
    : base(list)
{
    m_TextExtractor = textExtractor;
    m_Delimiter = delimiter;
}
/// <summary>
/// Deconstructs the field details into the field id, text extractor and tokenizer.
/// </summary>
internal void Deconstruct(out byte fieldId, out ITextExtractor textExtractor, out ITokenizer tokenizer)
{
    (fieldId, textExtractor, tokenizer) = (this.Id, this.TextExtractor, this.Tokenizer);
}
/// <summary>
/// Prints the given files, then the time it took to classify them, then the files
/// split into those the extractor can parse and those it cannot.
/// </summary>
/// <param name="textExtractor">Decides, per full file name, whether a file is parseable.</param>
/// <param name="files">The files to classify.</param>
private static void ShowFileParsingInfo(ITextExtractor textExtractor, FileInfo[] files)
{
    PrintFilesList("Found the following files:", files);

    // Classify every file exactly once. ToLookup runs immediately and keeps the
    // original order within each group, so the unparsable list no longer needs a
    // second IsParseable call per file.
    var timer = Stopwatch.StartNew();
    var filesByParseability = files.ToLookup(f => textExtractor.IsParseable(f.FullName));
    timer.Stop();

    Console.WriteLine("{0}{0}Processed in {1}{0}{0}", Environment.NewLine, timer.Elapsed);

    PrintFilesList("Parsable files:", filesByParseability[true].ToArray());
    PrintFilesList("Unparsable files:", filesByParseability[false].ToArray());
}
/// <summary>
/// Creates the details of an indexed field from its id, text extractor and tokenizer.
/// </summary>
internal IndexedFieldDetails(byte id, ITextExtractor textExtractor, ITokenizer tokenizer)
{
    (this.Id, this.TextExtractor, this.Tokenizer) = (id, textExtractor, tokenizer);
}