Example #1
0
        /// <summary>
        /// Builds the lookup by registering every supplied field reader, handing each registration
        /// the default text extractor and the default tokenizer.
        /// </summary>
        /// <param name="fieldReaders">The field readers to register; must not be null.</param>
        /// <param name="defaultTextExtractor">Passed to every registration; must not be null.</param>
        /// <param name="defaultTokenizer">Passed to every registration; must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
        internal IndexedFieldLookup(
            IEnumerable<IFieldReader> fieldReaders,
            ITextExtractor defaultTextExtractor,
            ITokenizer defaultTokenizer)
        {
            // Guards run in declaration order so the first null parameter is the one reported.
            _ = fieldReaders ?? throw new ArgumentNullException(nameof(fieldReaders));
            _ = defaultTextExtractor ?? throw new ArgumentNullException(nameof(defaultTextExtractor));
            _ = defaultTokenizer ?? throw new ArgumentNullException(nameof(defaultTokenizer));

            foreach (var reader in fieldReaders)
            {
                this.RegisterField(reader, defaultTextExtractor, defaultTokenizer);
            }
        }
Example #2
0
        /// <summary>
        /// Creates the search service. When no extractor is supplied, one is resolved from
        /// <c>Locator.Current</c>. The index and document folders are placed under the current
        /// user's "My Documents" folder.
        /// </summary>
        /// <param name="pdfTextExtractor">Extractor to use, or null to resolve one from the locator.</param>
        public LuceneSearchService(ITextExtractor pdfTextExtractor)
        {
            if (pdfTextExtractor != null)
            {
                _pdfTextExtractor = pdfTextExtractor;
            }
            else
            {
                _pdfTextExtractor = Locator.Current.GetService<ITextExtractor>();
            }

            // Both working folders share the same root.
            string documentsRoot = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);

            _indexPath     = Path.Combine(documentsRoot, "index");
            _documentsPath = Path.Combine(documentsRoot, "files");
        }
Example #3
0
        /// <summary>
        /// Creates the index, wiring up its collaborators, the field lookup built from the
        /// configured fields, and the root node produced by the node factory.
        /// </summary>
        /// <param name="indexOptions">Options stored on the index as supplied.</param>
        /// <param name="itemTokenizationOptions">Source of the configured fields; must not be null.</param>
        /// <param name="indexNodeFactory">Factory used to create the root node; must not be null.</param>
        /// <param name="queryParser">Query parser stored on the index; must not be null.</param>
        /// <param name="scorer">Scorer factory handed to the navigator pool.</param>
        /// <param name="defaultTextExtractor">Default text extractor; must not be null.</param>
        /// <param name="defaultTokenizer">Default tokenizer; must not be null.</param>
        /// <param name="indexModifiedActions">Optional actions stored on the index; may be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when a required argument is null.</exception>
        internal FullTextIndex(
            IndexOptions indexOptions,
            ConfiguredObjectTokenizationOptions<TKey> itemTokenizationOptions,
            IIndexNodeFactory indexNodeFactory,
            IQueryParser queryParser,
            IIndexScorerFactory scorer,
            ITextExtractor defaultTextExtractor,
            ITokenizer defaultTokenizer,
            Func<IIndexSnapshot<TKey>, Task>[]? indexModifiedActions)
        {
            this.indexNavigatorPool      = new IndexNavigatorPool(scorer);
            this.indexOptions            = indexOptions;
            this.itemTokenizationOptions = itemTokenizationOptions ?? throw new ArgumentNullException(nameof(itemTokenizationOptions));
            this.IndexNodeFactory        = indexNodeFactory ?? throw new ArgumentNullException(nameof(indexNodeFactory));
            this.queryParser             = queryParser ?? throw new ArgumentNullException(nameof(queryParser));

            // Validate here, like the sibling dependencies, instead of relying on the
            // IndexedFieldLookup constructor below to reject a null extractor.
            this.DefaultTextExtractor    = defaultTextExtractor ?? throw new ArgumentNullException(nameof(defaultTextExtractor));
            this.DefaultTokenizer        = defaultTokenizer ?? throw new ArgumentNullException(nameof(defaultTokenizer));
            this.indexModifiedActions    = indexModifiedActions;
            this.idPool      = new IdPool<TKey>();
            this.FieldLookup = new IndexedFieldLookup(
                this.itemTokenizationOptions.GetAllConfiguredFields(),
                defaultTextExtractor,
                defaultTokenizer);

            this.Root = this.IndexNodeFactory.CreateRootNode();
        }
 /// <summary>
 /// Creates the handler and stores its four collaborators.
 /// </summary>
 /// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
 public ImprovedTranslateCommandHandler(IEducationProfileDownloader downloader, ITextExtractor textExtractor, ITranslator translator, ITranslationPersister persister)
 {
     // Guards are checked in parameter order, so the first null argument is the one reported.
     if (downloader is null)
     {
         throw new ArgumentNullException(nameof(downloader));
     }

     if (textExtractor is null)
     {
         throw new ArgumentNullException(nameof(textExtractor));
     }

     if (translator is null)
     {
         throw new ArgumentNullException(nameof(translator));
     }

     if (persister is null)
     {
         throw new ArgumentNullException(nameof(persister));
     }

     _downloader    = downloader;
     _textExtractor = textExtractor;
     _translator    = translator;
     _persister     = persister;
 }
Example #5
0
 /// <summary>
 /// Creates the command and stores each supplied collaborator in the field of the same name.
 /// No argument validation is performed.
 /// </summary>
 public UploadImageCommand(ContinuityManager continuityManager, ITextExtractor textExtractor, IBitmapUtility bitmapUtility, IImageRatioValidator imageRatioValidator)
 {
     // Fields and parameters share names, so the explicit "this." qualifier is required.
     this.imageRatioValidator = imageRatioValidator;
     this.bitmapUtility       = bitmapUtility;
     this.textExtractor       = textExtractor;
     this.continuityManager   = continuityManager;
 }
Example #6
0
        /// <summary>
        /// Extracts the identifiers of a single file, splits each of them, and returns the
        /// filtered, non-empty splits.
        /// </summary>
        /// <param name="file">The file to read; its extension selects the text extractor.</param>
        /// <returns>
        /// A lazily evaluated sequence of the splits of every identifier that could be
        /// processed, passed through <c>Filter</c>, with null or empty entries removed.
        /// </returns>
        /// <exception cref="Exception">No extractor is registered for the file's extension.</exception>
        public IEnumerable<string> GetResult(IndexerFile file)
        {
            IndexerResult indexerResult = new IndexerResult();

            _configuration.Splitter.SetResultPhase(true);

            // Single lookup instead of ContainsKey followed by the indexer.
            ITextExtractor textExtractor;
            if (!_textExtractors.TryGetValue(file.Extension, out textExtractor))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ".";
                throw new Exception(message);
            }

            string fileText = File.ReadAllText(file.Path);

            foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
            {
                try
                {
                    IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
                    identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
                    indexerResult.AddSplitResult(identifierSplitResult);
                }
                catch (Exception)
                {
                    // Deliberately best-effort: an identifier that cannot be split is skipped
                    // so the remaining identifiers of the file are still returned.
                }
            }

            return indexerResult.GetSplitResultList()
                                .SelectMany(x => x.Splits)
                                .Select(Filter)
                                .Where(x => !string.IsNullOrEmpty(x));
        }
        /// <summary>
        /// Returns the text that <paramref name="textExtractor"/> produces for the file at
        /// <paramref name="filename"/>.
        /// </summary>
        /// <param name="textExtractor">Extractor whose <c>GetText</c> is called.</param>
        /// <param name="filename">Path handed to the extractor unchanged.</param>
        /// <returns>Whatever <c>GetText</c> returns for the file.</returns>
        private static string ParseFile(ITextExtractor textExtractor, string filename)
        {
            // Reference links kept from the original author (IFilter downloads and documentation):
            //   Images: https://www.google.se/search?q=tiff+ifilter&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:official&client=firefox-a&channel=fflb#hl=sv&client=firefox-a&hs=Cis&tbo=d&rls=org.mozilla:en-US%3Aofficial&channel=fflb&sclient=psy-ab&q=image+ifilter&oq=image+ifilter&gs_l=serp.3..0i13l4.4693.4693.1.4936.1.1.0.0.0.0.55.55.1.1.0...0.0...1c.1.k9PU0a3S1m4&pbx=1&bav=on.2,or.r_gc.r_pw.r_qf.&bvm=bv.1355534169,d.bGE&fp=71792659602ba5ba&bpcl=40096503&biw=1680&bih=919
            //   http://technet.microsoft.com/sv-se/library/dd834685.aspx
            //   http://technet.microsoft.com/en-us/library/dd744701%28v=ws.10%29.aspx
            //   PDF (x64, working): http://www.adobe.com/support/downloads/thankyou.jsp?ftpID=4025&fileID=3941
            //   http://www.adobe.com/support/downloads/detail.jsp?ftpID=2611
            //   http://www.foxitsoftware.com/products/ifilter/
            //   http://www.microsoft.com/en-us/download/details.aspx?id=3988
            //   docx: http://www.microsoft.com/en-us/download/details.aspx?id=20109
            //   http://www.microsoft.com/en-us/download/details.aspx?id=17062
            //
            // An earlier variant opened the file with File.OpenRead, called GetText on the stream,
            // and on any exception wrote the error to the console and returned "". It was disabled
            // in favour of the path-based call below, which lets exceptions propagate.
            string extractedText = textExtractor.GetText(filename);
            return extractedText;
        }
Example #8
0
 /// <summary>
 /// Creates the search service and stores the supplied collaborators in the matching fields.
 /// No argument validation is performed.
 /// </summary>
 public SearchService(IUserDao userDao, ICuyahogaContextProvider cuyahogaContextProvider, ITextExtractor textExtractor, IContentItemService<IContentItem> contentItemService)
 {
     _contentItemService      = contentItemService;
     _textExtractor           = textExtractor;
     _cuyahogaContextProvider = cuyahogaContextProvider;
     _userDao                 = userDao;
 }
Example #9
0
 /// <summary>
 /// Creates the search service and stores the supplied collaborators in the matching fields.
 /// No argument validation is performed.
 /// </summary>
 public SearchService(IUserDao userDao, ICuyahogaContextProvider cuyahogaContextProvider, ITextExtractor textExtractor, IContentItemService<IContentItem> contentItemService)
 {
     _contentItemService      = contentItemService;
     _textExtractor           = textExtractor;
     _cuyahogaContextProvider = cuyahogaContextProvider;
     _userDao                 = userDao;
 }
Example #10
0
        /// <summary>
        /// Builds the search-index document for a content item: fixed metadata fields first,
        /// then one field per category and view role, then the item's custom search fields.
        /// </summary>
        /// <param name="contentItem">The item to index; must implement ISearchableContent.</param>
        /// <param name="textExtractor">Handed to the item's ToSearchContent call.</param>
        /// <returns>The populated document.</returns>
        /// <exception cref="ArgumentException">
        /// The item is null or does not implement ISearchableContent.
        /// </exception>
        private Document BuildDocumentFromContentItem(IContentItem contentItem, ITextExtractor textExtractor)
        {
            if (!(contentItem is ISearchableContent))
            {
                throw new ArgumentException("Argument must implement ISearchableContent");
            }

            ISearchableContent searchable = (ISearchableContent)contentItem;

            // Text to index, with (x)html tags stripped.
            string rawContent = searchable.ToSearchContent(textExtractor);
            string strippedContent = System.Text.RegularExpressions.Regex.Replace(rawContent, @"<(.|\n)*?>", string.Empty);

            // Url under which the item can be reached.
            string url = contentItem.GetContentUrl();

            // Fall back to a truncated copy of the content when the item has no summary.
            string summaryText = contentItem.Summary;
            if (summaryText == null)
            {
                summaryText = Text.TruncateText(strippedContent, 200);
            }

            Document document = new Document();

            document.Add(new Field("globalid", contentItem.GlobalId.ToString("N"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("title", contentItem.Title, Field.Store.YES, Field.Index.TOKENIZED));
            document.Add(new Field("summary", summaryText, Field.Store.YES, Field.Index.TOKENIZED));
            document.Add(new Field("contents", strippedContent, Field.Store.NO, Field.Index.TOKENIZED));
            document.Add(new Field("author", contentItem.CreatedBy.FullName, Field.Store.YES, Field.Index.TOKENIZED));
            document.Add(new Field("moduletype", contentItem.Section.ModuleType.Name, Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("path", url, Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("site", contentItem.Section.Node.Site.Id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("datecreated", contentItem.CreatedAt.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("datemodified", contentItem.ModifiedAt.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));

            if (contentItem.PublishedAt.HasValue)
            {
                string published = contentItem.PublishedAt.Value.ToString("u");
                document.Add(new Field("datepublished", published, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            // The section id is stored but not indexed (it is used for access filtering).
            document.Add(new Field("sectionid", contentItem.Section.Id.ToString(), Field.Store.YES, Field.Index.NO));

            foreach (Category category in contentItem.Categories)
            {
                document.Add(new Field("category", category.Name, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            foreach (Role role in contentItem.ViewRoles)
            {
                document.Add(new Field("viewroleid", role.Id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            foreach (CustomSearchField custom in searchable.GetCustomSearchFields())
            {
                Field.Store storeMode;
                if (custom.IsStored)
                {
                    storeMode = Field.Store.YES;
                }
                else
                {
                    storeMode = Field.Store.NO;
                }

                Field.Index indexMode;
                if (custom.IsTokenized)
                {
                    indexMode = Field.Index.TOKENIZED;
                }
                else
                {
                    indexMode = Field.Index.UN_TOKENIZED;
                }

                // Custom fields without a key or a value are skipped.
                if (custom.FieldKey == null || custom.FieldValue == null)
                {
                    continue;
                }

                document.Add(new Field(custom.FieldKey, custom.FieldValue, storeMode, indexMode));
            }

            return document;
        }
 /// <summary>
 /// Creates the file resource service and stores the supplied collaborators in the matching
 /// fields. No argument validation is performed.
 /// </summary>
 public FileResourceService(IFileService fileService, ICuyahogaContextProvider cuyahogaContextProvider, IContentItemService<FileResource> contentItemService, ICommonDao commonDao, ITextExtractor textExtractor)
 {
     _fileService             = fileService;
     _cuyahogaContextProvider = cuyahogaContextProvider;
     _contentItemService      = contentItemService;
     _commonDao               = commonDao;
     _textExtractor           = textExtractor;
 }
Example #12
0
 /// <summary>
 /// Creates the file resource service and stores the supplied collaborators in the matching
 /// fields. No argument validation is performed.
 /// </summary>
 public FileResourceService(IFileService fileService, ICuyahogaContextProvider cuyahogaContextProvider, IContentItemService<FileResource> contentItemService, ICommonDao commonDao, ITextExtractor textExtractor)
 {
     _fileService             = fileService;
     _cuyahogaContextProvider = cuyahogaContextProvider;
     _contentItemService      = contentItemService;
     _commonDao               = commonDao;
     _textExtractor           = textExtractor;
 }
Example #13
0
        /// <summary>
        /// Creates an index builder over the given directory, initializes the index writer and
        /// logs that the builder was created.
        /// </summary>
        /// <param name="physicalIndexDir">Location of the index files.</param>
        /// <param name="rebuildIndex">Flag to indicate if the index should be rebuilt.</param>
        /// <param name="textExtractor">The text extractor that can be used to extract text from content.</param>
        public IndexBuilder(string physicalIndexDir, bool rebuildIndex, ITextExtractor textExtractor)
        {
            _textExtractor  = textExtractor;
            _rebuildIndex   = rebuildIndex;
            _indexDirectory = FSDirectory.GetDirectory(physicalIndexDir, false);

            // All fields are assigned before the writer is initialized.
            InitIndexWriter();

            log.Info("IndexBuilder created.");
        }
Example #14
0
        /// <summary>
        /// Creates an index builder over the given directory, initializes the index writer and
        /// logs that the builder was created.
        /// </summary>
        /// <param name="physicalIndexDir">Location of the index files.</param>
        /// <param name="rebuildIndex">Flag to indicate if the index should be rebuilt.</param>
        /// <param name="textExtractor">The text extractor that can be used to extract text from content.</param>
        public IndexBuilder(string physicalIndexDir, bool rebuildIndex, ITextExtractor textExtractor)
        {
            _textExtractor  = textExtractor;
            _rebuildIndex   = rebuildIndex;
            _indexDirectory = FSDirectory.GetDirectory(physicalIndexDir, false);

            // All fields are assigned before the writer is initialized.
            InitIndexWriter();

            log.Info("IndexBuilder created.");
        }
Example #15
0
        public async Task HandleAsync_uses_translator_to_translate_text([Frozen] ITextExtractor extractor, [Frozen] ITranslator translator, ImprovedTranslateCommandHandler sut, IDispatcher dispatcher, ICommandContext<TranslateEducationCommand> context, string[] paragraphs)
        {
            // Arrange: the extractor yields the generated paragraphs for any input.
            var extractorMock = Mock.Get(extractor);
            extractorMock.Setup(p => p.ExtractText(It.IsAny<string>())).Returns(paragraphs);

            // Act
            await sut.HandleAsync(dispatcher, context);

            // Assert: every paragraph was translated into the language named by the command.
            var translatorMock = Mock.Get(translator);
            foreach (var paragraph in paragraphs)
            {
                translatorMock.Verify(p => p.TranslateText(paragraph, context.Command.ToLanguage));
            }
        }
        public async Task HandleAsync_uses_extractor_to_extract_paragraphs(
            [Frozen] IEducationProfileDownloader downloader,
            [Frozen] ITextExtractor extractor,
            ImprovedTranslateCommandHandler sut,
            IDispatcher dispatcher,
            ICommandContext<TranslateEducationCommand> context,
            string content)
        {
            // Arrange: the downloader returns the generated content for any profile id.
            var downloaderMock = Mock.Get(downloader);
            downloaderMock.Setup(p => p.GetProfile(It.IsAny<int>())).ReturnsAsync(content);

            // Act
            await sut.HandleAsync(dispatcher, context);

            // Assert: the downloaded content was handed to the extractor.
            var extractorMock = Mock.Get(extractor);
            extractorMock.Verify(p => p.ExtractText(content));
        }
Example #17
0
        /// <summary>
        /// Builds the indexed representation of an uploaded document: file name without
        /// extension, creation time, owner id, extracted content and word frequencies.
        /// </summary>
        /// <param name="uploadedDocument">The uploaded document; its extension selects the text extractor.</param>
        /// <param name="user">The user whose id is recorded on the document.</param>
        /// <param name="documentIndexator">Computes the word frequency of content plus file name.</param>
        public IndexedDocument(DocumentModel uploadedDocument, User user, DocumentIndexator documentIndexator)
        {
            FileName    = Path.GetFileNameWithoutExtension(uploadedDocument.File.FileName);
            CreatedDate = DateTime.Now;
            UserId      = user.Id;

            Content = TextExtractorFactory
                      .GetTextExtractor(uploadedDocument.DocumentExtension)
                      .ExtractText(uploadedDocument);

            // The file name is appended so its words are counted as well.
            var textToCount = Content + ' ' + FileName;
            WordFrequency = documentIndexator.ExtractWordFrequency(textToCount);
        }
Example #18
0
        /// <summary>
        /// Feeds every identifier found in the configured files to the splitter's token
        /// dictionary, reporting progress and asking the user how to proceed on problems.
        /// </summary>
        private void UpdateTokenDictionary()
        {
            _configuration.Splitter.SetResultPhase(false);

            int fileTotal = _configuration.FilesToScan.Count;
            int filesDone = 0;

            foreach (IndexerFile scanned in _configuration.FilesToScan)
            {
                try
                {
                    _configuration.NotificationHandler.UpdateStatus(NotificationType.AnalyzingFile, filesDone, fileTotal, "Extracting file: " + scanned.Name);

                    // Only ask the user when no extractor is registered for this extension.
                    if (!_textExtractors.ContainsKey(scanned.Extension)
                        && _configuration.NotificationHandler.GetYesNoAnswer(
                            QuestionType.NoTextExtratorDefined,
                            "No extractor is defined for file extension: " + scanned.Extension + ".  Do you want to skip this file?"))
                    {
                        continue;
                    }

                    // NOTE(review): if the extractor is missing and the user chose not to skip,
                    // this lookup presumably throws and the catch below asks again - confirm.
                    ITextExtractor extractor = _textExtractors[scanned.Extension];
                    string source = File.ReadAllText(scanned.Path);

                    foreach (string identifier in extractor.Extract(source))
                    {
                        _configuration.NotificationHandler.UpdateStatus(NotificationType.IdentifyingToken, filesDone, fileTotal, "Analyzing token: " + identifier + " in file: " + scanned.Name);
                        _configuration.Splitter.UpdateTokenDictionary(identifier);
                    }
                }
                catch (Exception e)
                {
                    string prompt = "Error reading file. " + Environment.NewLine + "File: " + scanned.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
                    bool skipFile = _configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, prompt);
                    if (!skipFile)
                    {
                        throw;
                    }
                }
                finally
                {
                    // Counted whether the file was processed, skipped or failed.
                    filesDone++;
                }
            }
        }
Example #19
0
 /// <summary>
 /// Creates the form for the given group: sets up its helpers and collections, initializes
 /// the designer components, loads and paints the students, and starts the text extractor.
 /// </summary>
 /// <param name="grupo">The group stored on the form.</param>
 public PaseLista(Grupo grupo)
 {
     // Field and parameter share a name, so the qualifier is required here.
     this.grupo = grupo;

     textExtractor       = new DefaultTextExtractor(1);
     repositorioAlumno   = new RepositorioAlumno();
     repositorioSesiones = new RepositorioSesiones();

     tableLabels      = new List<Label>();
     tableButtons     = new List<Button>();
     alumnosPresentes = new Dictionary<string, Alumno>();
     alumnosAusentes  = new Dictionary<string, Alumno>();
     labelMap         = new Dictionary<string, Label>();
     nameTokensMap    = new Dictionary<string, string[]>();

     labelWidth  = 780;
     labelHeight = 24;

     // State above is set before the designer components and student list are built.
     InitializeComponent();
     CargarAlumnos();
     PintarAlumnos();

     textExtractor.StartWorking();
 }
Example #20
0
        /// <summary>
        /// Parses every file the extractor reports as parseable, writes each result to the
        /// console, and finally prints the total elapsed time.
        /// </summary>
        /// <param name="textExtractor">Decides which files are parseable and extracts their text.</param>
        /// <param name="files">Candidate files.</param>
        private static void ParseFiles(ITextExtractor textExtractor, IEnumerable<FileInfo> files)
        {
            // Filtering is done up front so it is not part of the measured time.
            FileInfo[] candidates = files.Where(f => textExtractor.IsParseable(f.FullName)).ToArray();

            Stopwatch stopwatch = Stopwatch.StartNew();

            foreach (FileInfo candidate in candidates)
            {
                Console.WriteLine("{0}{0}---------------------------------------", Environment.NewLine);
                Console.WriteLine("Parsing file: {0}{1}", candidate.Name, Environment.NewLine);

                var parsed = ParseFile(textExtractor, candidate.FullName);

                Console.WriteLine("Parsed result is: ");
                Console.Write(parsed);
            }

            stopwatch.Stop();
            Console.WriteLine("{0}{0}Processed in {1}", Environment.NewLine, stopwatch.Elapsed);
        }
Example #21
0
        /// <summary>
        /// Registers a field under a newly allocated byte id, using the field's own tokenizer
        /// and text extractor when it has them and the supplied defaults otherwise.
        /// </summary>
        /// <param name="fieldOptions">The field to register.</param>
        /// <param name="defaultTextExtractor">Used when the field has no text extractor.</param>
        /// <param name="defaultTokenizer">Used when the field has no tokenizer.</param>
        /// <exception cref="LiftiException">
        /// The field name is already registered, or the allocated id exceeds <see cref="byte.MaxValue"/>.
        /// </exception>
        private void RegisterField(IFieldReader fieldOptions, ITextExtractor defaultTextExtractor, ITokenizer defaultTokenizer)
        {
            var name = fieldOptions.Name;
            if (this.fieldToDetailsLookup.ContainsKey(name))
            {
                throw new LiftiException(ExceptionMessages.FieldNameAlreadyUsed, name);
            }

            // The counter is advanced before the range check.
            var allocated = Interlocked.Increment(ref nextId);
            if (allocated > byte.MaxValue)
            {
                throw new LiftiException(ExceptionMessages.MaximumDistinctFieldsIndexReached);
            }

            var fieldId   = (byte)allocated;
            var tokenizer = fieldOptions.Tokenizer ?? defaultTokenizer;
            var extractor = fieldOptions.TextExtractor ?? defaultTextExtractor;
            var details   = new IndexedFieldDetails(fieldId, extractor, tokenizer);

            this.fieldToDetailsLookup[name] = details;
            this.idToFieldLookup[fieldId]   = name;
        }
Example #22
0
 /// <summary>
 /// Delegates to the three-argument constructor, passing a new empty <see cref="ArrayList"/>,
 /// <c>m_DefaultDelimiter</c> and the supplied text extractor.
 /// </summary>
 /// <param name="textExtractor">Forwarded unchanged to the three-argument constructor.</param>
 public JoinableList(ITextExtractor textExtractor)
     : this(new ArrayList(), m_DefaultDelimiter, textExtractor)
 {
 }
Example #23
0
 /// <summary>
 /// Delegates to the three-argument constructor, passing a new empty <see cref="ArrayList"/>,
 /// the supplied delimiter and the supplied text extractor.
 /// </summary>
 /// <param name="delimiter">Forwarded unchanged to the three-argument constructor.</param>
 /// <param name="textExtractor">Forwarded unchanged to the three-argument constructor.</param>
 public JoinableList(string delimiter, ITextExtractor textExtractor)
     : this(new ArrayList(), delimiter, textExtractor)
 {
 }
Example #24
0
        /// <summary>
        /// Runs the result phase over all configured files: extracts and splits identifiers,
        /// folds in the token dictionary's merged, misspelled and stemmed data, optionally
        /// stems the dictionary words, and filters the result.
        /// </summary>
        /// <returns>The populated and filtered indexer result.</returns>
        private IndexerResult GetResult()
        {
            IndexerResult result = new IndexerResult();

            _configuration.Splitter.SetResultPhase(true);

            // Pass 1: extract and split the identifiers of every file.
            int fileTotal = _configuration.FilesToScan.Count;
            int filesDone = 0;

            foreach (IndexerFile scanned in _configuration.FilesToScan)
            {
                try
                {
                    _configuration.NotificationHandler.UpdateStatus(NotificationType.ReadingFileForIdentifiers, filesDone, fileTotal, "Extracting file identifier: " + scanned.Name);

                    // Only ask the user when no extractor is registered for this extension.
                    if (!_textExtractors.ContainsKey(scanned.Extension)
                        && _configuration.NotificationHandler.GetYesNoAnswer(
                            QuestionType.NoTextExtratorDefined,
                            "No extractor is defined for file extension: " + scanned.Extension + ".  Do you want to skip this file?"))
                    {
                        continue;
                    }

                    // NOTE(review): if the extractor is missing and the user chose not to skip,
                    // this lookup presumably throws and the catch below asks again - confirm.
                    ITextExtractor extractor = _textExtractors[scanned.Extension];
                    string source = File.ReadAllText(scanned.Path);

                    foreach (string identifier in extractor.Extract(source, _configuration.ExtractType))
                    {
                        _configuration.NotificationHandler.UpdateStatus(NotificationType.Splitting, filesDone, fileTotal, "Splitting token: " + identifier + " in file: " + scanned.Name);

                        IdentifierSplitResult split = new IdentifierSplitResult(identifier, scanned);
                        split.Add(_configuration.Splitter.Split(identifier));
                        result.AddSplitResult(split);
                    }
                }
                catch (Exception e)
                {
                    string prompt = "Error reading file. " + Environment.NewLine + "File: " + scanned.Name +
                                    Environment.NewLine + "Message: " + e.Message + Environment.NewLine +
                                    "Do you want to skip this file?";
                    bool skipFile = _configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, prompt);
                    if (!skipFile)
                    {
                        throw;
                    }
                }
                finally
                {
                    // Counted whether the file was processed, skipped or failed.
                    filesDone++;
                }
            }

            // Merged-token, misspelled and stemmed information is not available while results
            // are being added, so it is applied from the token dictionary afterwards.
            result.UpdateFromMergeToken(_tokenDictionary);
            result.UpdateFromMisspelled(_tokenDictionary);
            result.UpdateFromStemmed(_tokenDictionary);

            // Pass 2 (only with a stemmer): replace a word by its stem when the stem differs
            // and is itself a dictionary word.
            if (_configuration.Stemmer != null)
            {
                List<string> words = result.GetDictionaryWordList().Keys.ToList();
                int wordTotal = words.Count;

                for (int position = 0; position < wordTotal; position++)
                {
                    string word = words[position];

                    // Progress is reported one-based.
                    _configuration.NotificationHandler.UpdateStatus(NotificationType.Stemming, position + 1, wordTotal, "Stemming: " + word);

                    string stem = _configuration.Stemmer.GetStemmedText(word);
                    if (stem != null && stem != word && _configuration.Dictionary.IsWord(stem))
                    {
                        result.AddStemmedWordAndReplaceIdentified(word, stem);
                    }
                }
            }

            // Final filtering against the dictionary.
            result.RemoveFilterWordAndTokenResult(_configuration.Dictionary);

            _configuration.NotificationHandler.UpdateStatus(NotificationType.IndexingCompleted, 1, 1, "Indexing Completed");
            return result;
        }
Example #25
0
 /// <summary>
 /// Creates the index service and stores the supplied collaborators in the matching fields.
 /// No argument validation is performed.
 /// </summary>
 public DssIndexService(ILuceneIndexService<IndexedDocument> luceneIndexService, ITextExtractor textExtractor, IRepository<Document> documentsRepository)
 {
     this._documentsRepository = documentsRepository;
     this._textExtractor       = textExtractor;
     this._luceneIndexService  = luceneIndexService;
 }
Example #26
0
 /// <summary>
 /// Gets the full contents of this content item for indexing.
 /// </summary>
 /// <param name="textExtractor">Not used by this implementation; overrides may use it.</param>
 /// <returns>The value of the <c>_content</c> field.</returns>
 public virtual string ToSearchContent(ITextExtractor textExtractor)
 {
     return _content;
 }
Example #27
0
        /// <summary>
        /// Click handler that validates the form, lets the user pick a source file, splits it
        /// into documents with the selected extractor, saves one data tree per document and
        /// lists the document names. The popup is closed unless validation fails.
        /// </summary>
        private void formatOkay_Click(object sender, RoutedEventArgs e)
        {
            // Validation: each failure returns early and leaves the popup open.
            if (baseTree == null)
            {
                MessageBox.Show("Please select a content tree for the data tree.");
                return;
            }

            if (formatBox.SelectedIndex == -1)
            {
                formatBox.BorderBrush = Brushes.Red;
                return;
            }

            if (string.IsNullOrEmpty(documentFormatBox.Text))
            {
                documentFormatBox.BorderBrush = Brushes.Red;
                return;
            }

            OpenFileDialog picker = new OpenFileDialog
            {
                FileName   = "Tree",
                DefaultExt = ".txt"
            };

            bool? accepted = picker.ShowDialog();

            if (accepted.HasValue && accepted.Value)
            {
                string sourcePath = picker.FileName;
                documentLabel.Content = sourcePath + "datatrees";

                using (Ookii.Dialogs.Wpf.ProgressDialog progress = new ProgressDialog())
                {
                    progress.ProgressBarStyle = ProgressBarStyle.MarqueeProgressBar;
                    progress.Show();
                    progress.Description = "Analyzing text...";

                    IIO sourceReader = new FileIO();
                    ITextExtractor extractor = null;

                    // The selected format decides how the source is read and split.
                    int formatIndex = formatBox.SelectedIndex;
                    if (formatIndex == 0)
                    {
                        string wholeText = sourceReader.ReadSource(sourcePath);
                        extractor = new XMLTextExtractor(wholeText, documentFormatBox.Text);
                    }
                    else if (formatIndex == 1)
                    {
                        var textParts = sourceReader.ReadSourceIterable(sourcePath);
                        extractor = new BeginMarkerExtraction(textParts, documentFormatBox.Text);
                    }
                    else
                    {
                        throw new InvalidOperationException();
                    }

                    documents = new ObservableCollection<string>();

                    while (extractor.HasNextContent())
                    {
                        string documentText = extractor.FindNextContent();
                        string documentName = Helpers.GetNameWhenFirst(documentText);
                        documents.Add(documentName);

                        IDataTree dataTree = DataTreeBuilder.CreateDocumentMappedTree(baseTree);
                        DataTreeBuilder.AddToDataTree(dataTree, documentText);

                        ITreeIO treeWriter = new TreeIO();
                        treeWriter.SaveDataTree(dataTree, sourcePath + @"datatrees\" + documentName + ".dtree");
                    }

                    documentList.ItemsSource = documents;
                }
            }

            buildDataTreePopup.IsOpen = false;
        }
Example #28
0
        /// <summary>
        /// Extracts the text of <paramref name="stream"/>, choosing the extractor from the
        /// extension of <paramref name="fileName"/>.
        /// </summary>
        /// <param name="stream">Content to read; a null or zero-length stream yields an empty result without an error.</param>
        /// <param name="fileName">File name whose extension selects the extractor.</param>
        /// <param name="errorMessage">
        /// Null on success; otherwise the reason no text was produced. When the extractor
        /// throws, the exception message is reported.
        /// </param>
        /// <returns>
        /// The extracted text with NUL characters replaced by '.', or an empty string when
        /// nothing could be extracted. Never null.
        /// </returns>
        public static string GetExtract(Stream stream, string fileName, out string errorMessage)
        {
            errorMessage = null;

            if (stream == null || stream.Length == 0)
            {
                return String.Empty;
            }

            if (String.IsNullOrEmpty(fileName))
            {
                errorMessage = "Cannot resolve a TextExtractor if FileName is null or empty";
                return String.Empty;
            }

            var extension = Path.GetExtension(fileName);
            if (String.IsNullOrEmpty(extension))
            {
                errorMessage = "Cannot resolve a TextExtractor if FileName's extension is null or empty";
                return String.Empty;
            }

            extension = extension.TrimStart('.');
            if (extension.Length == 0)
            {
                errorMessage = "Cannot resolve a TextExtractor if FileName's extension is empty";
                return String.Empty;
            }

            // Extensions are identifiers, not linguistic text: lower-case culture-independently.
            extension = extension.ToLowerInvariant();

            // Plain text is read directly from the stream, so the switch below needs no "txt" case.
            if (extension == "txt")
            {
                return SenseNet.ContentRepository.Tools.GetStreamString(stream);
            }

            ITextExtractor extractor;
            switch (extension)
            {
            case "contenttype":
            case "xml": extractor = new XmlTextExtractor(); break;

            case "doc": extractor = new DocTextExtractor(); break;

            case "xls": extractor = new XlsTextExtractor(); break;

            case "pdf": extractor = new PdfTextExtractor(); break;

            case "docx": extractor = new DocxTextExtractor(); break;

            case "xlsx": extractor = new XlsxTextExtractor(); break;

            case "pptx": extractor = new PptxTextExtractor(); break;

            default:
                errorMessage = String.Format("Cannot resolve a TextExtractor for this extension: '{0}'", extension);
                return String.Empty;
            }

            string result;
            try
            {
                // Synchronous extraction. A time-boxed variant (5000 ms limit, aborting on
                // timeout) existed earlier as commented-out code and is not in use.
                // A null return is normalized so the Replace call below cannot fail.
                result = extractor.Extract(stream) ?? String.Empty;
            }
            catch (Exception e)
            {
                errorMessage = String.Format("An error occured during extracting text. Path: {0}. Message: {1}", fileName, e.Message);
                result = String.Empty;
            }

            // Report the generic message only when no more specific error was recorded;
            // otherwise the exception detail from the catch block would be lost.
            if (result.Length == 0 && errorMessage == null)
            {
                var format = @"Couldn't extract text. FileName: '{0}' ";
                errorMessage = String.Format(CultureInfo.InvariantCulture, format, fileName);
            }

            return result.Replace('\0', '.');
        }
Example #29
0
        /// <summary>
        /// Builds the Lucene <see cref="Document"/> that represents a content item in the search index.
        /// </summary>
        /// <param name="contentItem">The content item to index; it must also implement <see cref="ISearchableContent"/>.</param>
        /// <param name="textExtractor">Extractor handed to <c>ToSearchContent</c> to obtain the text to index.</param>
        /// <returns>A document holding the fixed set of fields below plus the item's custom search fields.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="contentItem"/> is null or does not implement <see cref="ISearchableContent"/>.
        /// </exception>
        private Document BuildDocumentFromContentItem(IContentItem contentItem, ITextExtractor textExtractor)
        {
            ISearchableContent searchInfo = contentItem as ISearchableContent;
            if (searchInfo == null)
            {
                throw new ArgumentException("Argument must implement ISearchableContent");
            }

            // Get the text of the content item to index. An implementation may return null
            // (nothing to index); Regex.Replace rejects null input, so fall back to empty text
            // instead of failing the whole document build.
            string contentToIndex = searchInfo.ToSearchContent(textExtractor) ?? string.Empty;
            // strip (x)html tags
            string plainTextContent = System.Text.RegularExpressions.Regex.Replace(contentToIndex, @"<(.|\n)*?>", string.Empty);
            // create the actual url
            string path = contentItem.GetContentUrl();
            // when the item has no summary, use the first part of the plain text instead
            string summary = contentItem.Summary ?? Text.TruncateText(plainTextContent, 200);

            Document doc = new Document();
            doc.Add(new Field("globalid", contentItem.GlobalId.ToString("N"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("title", contentItem.Title, Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("summary", summary, Field.Store.YES, Field.Index.TOKENIZED));
            // the full text is searchable but not stored in the index
            doc.Add(new Field("contents", plainTextContent, Field.Store.NO, Field.Index.TOKENIZED));
            doc.Add(new Field("author", contentItem.CreatedBy.FullName, Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("moduletype", contentItem.Section.ModuleType.Name, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("path", path, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("site", contentItem.Section.Node.Site.Id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("datecreated", contentItem.CreatedAt.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("datemodified", contentItem.ModifiedAt.ToString("u"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            if (contentItem.PublishedAt.HasValue)
            {
                doc.Add(new Field("datepublished", contentItem.PublishedAt.Value.ToString("u"), Field.Store.YES,
                                  Field.Index.UN_TOKENIZED));
            }
            // do not index the sectionid here (since it's used for access filtering)
            doc.Add(new Field("sectionid", contentItem.Section.Id.ToString(), Field.Store.YES, Field.Index.NO));

            foreach (Category cat in contentItem.Categories)
            {
                doc.Add(new Field("category", cat.Name, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            foreach (Role viewRole in contentItem.ViewRoles)
            {
                doc.Add(new Field("viewroleid", viewRole.Id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            foreach (CustomSearchField field in searchInfo.GetCustomSearchFields())
            {
                // custom fields missing either a key or a value are skipped
                if (field.FieldKey == null || field.FieldValue == null)
                {
                    continue;
                }

                Field.Store store = field.IsStored ? Field.Store.YES : Field.Store.NO;
                Field.Index index = field.IsTokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED;
                doc.Add(new Field(field.FieldKey, field.FieldValue, store, index));
            }
            return doc;
        }
Example #30
0
 /// <summary>
 /// Produces the text of this content item that is to be indexed.
 /// </summary>
 /// <param name="textExtractor">Extractor that is asked to read the file at the item's physical path.</param>
 /// <returns>Whatever the extractor returns for the item's physical file.</returns>
 public virtual string ToSearchContent(ITextExtractor textExtractor)
 {
     var physicalPath = this._physicalFilePath;
     return textExtractor.ExtractTextFromFile(physicalPath);
 }
Example #31
0
 /// <summary>
 /// Produces the text of this item that is to be indexed.
 /// </summary>
 /// <param name="textExtractor">Not used by this implementation; the stored content is returned as is.</param>
 /// <returns>The value of the <c>_content</c> field.</returns>
 public virtual string ToSearchContent(ITextExtractor textExtractor)
 {
     var indexableText = this._content;
     return indexableText;
 }
Example #32
0
 /// <summary>
 /// Configures the index to use a text extraction process when indexing text. This is useful when
 /// source text contains markup, e,g. for XML/HTML you can use the <see cref="XmlTextExtractor"/>.
 /// </summary>
 public FullTextIndexBuilder <TKey> WithTextExtractor(ITextExtractor textExtractor)
 {
     this.defaultTextExtractor = textExtractor;
     return(this);
 }
Example #33
0
 /// <summary>
 /// Creates a <c>FileActions</c> instance that keeps the supplied text extractor.
 /// </summary>
 /// <param name="textExtractor">The extractor stored in the <c>_textExtractor</c> field.</param>
 public FileActions(ITextExtractor textExtractor)
 {
     this._textExtractor = textExtractor;
 }
 /// <summary>
 /// Returns the complete text of the ContentItem so that it can be indexed.
 /// </summary>
 /// <param name="textExtractor">Extractor used to read the file stored at the item's physical path.</param>
 /// <returns>The text the extractor produced for that file.</returns>
 public virtual string ToSearchContent(ITextExtractor textExtractor)
 {
     var extractedText = textExtractor.ExtractTextFromFile(this._physicalFilePath);
     return extractedText;
 }
Example #35
0
 /// <summary>
 /// Initializes a <c>JoinableList</c> by delegating to the three-argument constructor,
 /// passing <c>m_DefaultDelimiter</c> as the delimiter.
 /// </summary>
 /// <param name="list">Forwarded unchanged to the three-argument constructor.</param>
 /// <param name="textExtractor">Forwarded unchanged to the three-argument constructor.</param>
 public JoinableList(IList list, ITextExtractor textExtractor)
     : this(list, m_DefaultDelimiter, textExtractor)
 {
 }
Example #36
0
        /// <summary>
        /// Extracts the text of a binary, choosing the extractor from the file extension and running
        /// the extraction in a <c>TimeboxedActivity</c> limited by <c>Repository.TextExtractTimeout</c>.
        /// </summary>
        /// <param name="binaryData">The binary to read; may be null.</param>
        /// <param name="node">Supplies the version and path written to the log and is passed to <c>WriteError</c>.</param>
        /// <returns>
        /// The extracted text with every NUL character replaced by '.', or an empty string when the
        /// binary, its file name, its extension or its stream is missing or empty, when no extractor
        /// is registered for the extension, or when the extraction times out or fails.
        /// </returns>
        public static string GetExtract(BinaryData binaryData, Node node)
        {
            if (binaryData == null)
            {
                return string.Empty;
            }

            var fname = binaryData.FileName;
            if (fname == null)
            {
                return string.Empty;
            }

            var ext = fname.Extension;
            if (String.IsNullOrEmpty(ext))
            {
                return string.Empty;
            }

            ITextExtractor extractor;

            // The extension is a non-linguistic key, so lower-case it with the invariant culture
            // rather than with whatever culture the current thread happens to run under.
            switch (ext.ToLowerInvariant())
            {
            case "contenttype":
            case "xml": extractor = new XmlTextExtractor(); break;

            case "doc": extractor = new DocTextExtractor(); break;

            case "xls": extractor = new XlsTextExtractor(); break;

            case "pdf": extractor = new PdfTextExtractor(); break;

            case "docx": extractor = new DocxTextExtractor(); break;

            case "xlsx": extractor = new XlsxTextExtractor(); break;

            case "pptx": extractor = new PptxTextExtractor(); break;

            case "txt": extractor = new PlainTextExtractor(); break;

            default:
                // no extractor is available for this extension
                return String.Empty;
            }

            var stream = binaryData.GetStream();
            if (stream == null || stream.Length == 0)
            {
                return String.Empty;
            }

            var result = string.Empty;

            try
            {
                // Run the extraction through a TimeboxedActivity so that it can be abandoned
                // when it does not finish within the configured timeout.
                Action<TimeboxedActivity> timeboxedFunctionCall = activity =>
                {
                    var input = (Stream)activity.InArgument;
                    activity.OutArgument = extractor.Extract(input);
                };

                var act = new TimeboxedActivity();
                act.InArgument = stream;
                act.Activity   = timeboxedFunctionCall;
                act.Context    = HttpContext.Current;

                // NOTE(review): the "* 1000" suggests TextExtractTimeout is in seconds and
                // ExecuteAndWait takes milliseconds - confirm against their definitions.
                var finishedWithinTime = act.ExecuteAndWait(Repository.TextExtractTimeout * 1000);
                if (!finishedWithinTime)
                {
                    act.Abort();
                    var msg = String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path);
                    Logger.WriteWarning(msg);
                    return String.Empty;
                }

                if (act.ExecutionException != null)
                {
                    WriteError(act.ExecutionException, node);
                }
                else
                {
                    result = (string)act.OutArgument;
                }
            }
            catch (Exception e)
            {
                // best effort: report the failure and fall through with whatever result we have
                WriteError(e, node);
            }

            if (String.IsNullOrEmpty(result))
            {
                var format = @"Couldn't extract text. VersionId: {0}, path: '{1}' ";
                var inf    = String.Format(CultureInfo.InvariantCulture, format, node.VersionId, node.Path);
                Logger.WriteWarning(inf);
            }

            // NUL characters in the extracted text are replaced with '.'
            result = result.Replace('\0', '.');
            return result;
        }
Example #37
0
 /// <summary>
 /// Initializes a <c>JoinableList</c> over the given list with an explicit delimiter and text extractor.
 /// </summary>
 /// <param name="list">Passed to the base class constructor.</param>
 /// <param name="delimiter">Stored in <c>m_Delimiter</c>.</param>
 /// <param name="textExtractor">Stored in <c>m_TextExtractor</c>.</param>
 public JoinableList(IList list, string delimiter, ITextExtractor textExtractor)
     : base(list)
 {
     m_TextExtractor = textExtractor;
     m_Delimiter = delimiter;
 }
Example #38
0
 /// <summary>
 /// Splits this instance into its id, text extractor and tokenizer so that it can be
 /// used with deconstruction syntax.
 /// </summary>
 /// <param name="fieldId">Receives the value of <c>Id</c>.</param>
 /// <param name="textExtractor">Receives the value of <c>TextExtractor</c>.</param>
 /// <param name="tokenizer">Receives the value of <c>Tokenizer</c>.</param>
 internal void Deconstruct(out byte fieldId, out ITextExtractor textExtractor, out ITokenizer tokenizer)
 {
     // assigned in parameter order
     fieldId = this.Id;
     textExtractor = this.TextExtractor;
     tokenizer = this.Tokenizer;
 }
Example #39
0
        /// <summary>
        /// Prints the given files, then prints which of them the extractor reports as parseable
        /// and which it does not, together with the time spent classifying them.
        /// </summary>
        /// <param name="textExtractor">Extractor whose <c>IsParseable</c> is called with each file's full name.</param>
        /// <param name="files">The files to classify.</param>
        private static void ShowFileParsingInfo(ITextExtractor textExtractor, FileInfo[] files)
        {
            PrintFilesList("Found the following files:", files);

            // Classify every file exactly once. ToLookup runs immediately, so the stopwatch
            // covers all IsParseable calls, and each group keeps the files in their input order.
            var timer = Stopwatch.StartNew();
            var filesByParseability = files.ToLookup(f => textExtractor.IsParseable(f.FullName));
            timer.Stop();
            Console.WriteLine("{0}{0}Processed in {1}{0}{0}", Environment.NewLine, timer.Elapsed);

            // Indexing a lookup with a key that has no entries yields an empty sequence.
            PrintFilesList("Parsable files:", filesByParseability[true].ToArray());
            PrintFilesList("Unparsable files:", filesByParseability[false].ToArray());
        }
Example #40
0
 /// <summary>
 /// Initializes the details of an indexed field from its id, text extractor and tokenizer.
 /// </summary>
 /// <param name="id">Stored in <c>Id</c>.</param>
 /// <param name="textExtractor">Stored in <c>TextExtractor</c>.</param>
 /// <param name="tokenizer">Stored in <c>Tokenizer</c>.</param>
 internal IndexedFieldDetails(byte id, ITextExtractor textExtractor, ITokenizer tokenizer)
 {
     this.Tokenizer = tokenizer;
     this.TextExtractor = textExtractor;
     this.Id = id;
 }