Пример #1
0
        /// <inheritdoc />
        public override XPathNavigator this[string key]
        {
            get
            {
                string file;

                // A key that is not in the index has no file to search, so there is nothing to return
                if (!index.TryGetValue(key, out file))
                {
                    return null;
                }

                IndexedDocument document;

                // Serve from the cache when possible; otherwise load the file and cache it
                if (!cache.TryGetValue(file, out document))
                {
                    document = new IndexedDocument(this, file);

                    // Make room by dropping the file that was queued earliest.  The queue is checked
                    // as well so that an empty queue (for example when cacheSize is zero or negative)
                    // does not make Dequeue throw.
                    if (cache.Count >= cacheSize && queue.Count > 0)
                    {
                        cache.Remove(queue.Dequeue());
                    }

                    cache.Add(file, document);
                    queue.Enqueue(file);
                }

                return document[key];
            }
        }
        /// <summary>
        /// Scores indexed documents against each phrase of <paramref name="indexedDocument"/> whose
        /// <c>Length</c> is at least 5.
        /// </summary>
        /// <remarks>
        /// For every term of a phrase, each document reported by the Lucene service for that term has
        /// its term vector compared with the phrase's frequency vector.  A shared term contributes only
        /// when <c>scamConfig.Epsilon - (f1/f2 + f2/f1)</c> is positive.  Two running totals are kept per
        /// document: one divided by the phrase's sum of squared frequencies and one divided by the
        /// document's.
        /// </remarks>
        /// <param name="indexedDocument">The document whose phrases are scored.</param>
        /// <returns>The result of <c>NormalizeScores</c> applied to the two accumulated score tables.</returns>
        private IDictionary <string, double> GetScoreByPhrase(IndexedDocument indexedDocument)
        {
            var documentScores1 = new Dictionary <string, double>();
            var documentScores2 = new Dictionary <string, double>();

            foreach (var phrase in indexedDocument.Phrases.Where(ph => ph.Length >= 5))
            {
                var frequencyVector = vectorTokenizer.GetTokenFrequencyVector(phrase);

                // Sum of squared term frequencies of the phrase
                long denom1 = frequencyVector.Select(kv => kv.Value * kv.Value).Sum();

                foreach (var term in frequencyVector.Keys)
                {
                    var occurences = luceneService.GetOccurences(term);
                    foreach (var document in occurences.GetDocuments())
                    {
                        var documentTerms = occurences.GetTermsVectors(document);

                        // Sum of squared term frequencies of the matched document
                        long denom2 = documentTerms.Select(kv => kv.Value * kv.Value).Sum();

                        foreach (var documentTerm in documentTerms)
                        {
                            // Only terms present on both sides can contribute
                            if (!frequencyVector.ContainsKey(documentTerm.Key))
                            {
                                continue;
                            }

                            long f1 = frequencyVector[documentTerm.Key];
                            long f2 = documentTerm.Value;

                            // Kept as a positive "> 0" test on purpose: a NaN result (0/0) must not pass
                            if ((scamConfig.Epsilon - (((double)f1 / f2) + ((double)f2 / f1))) > 0)
                            {
                                double product = (double)(f1 * f2);

                                // TryGetValue leaves the out value at 0 for a new key, so one lookup
                                // covers both the "first contribution" and the "add to total" cases
                                double score1;
                                documentScores1.TryGetValue(document, out score1);
                                documentScores1[document] = score1 + (product / denom1);

                                double score2;
                                documentScores2.TryGetValue(document, out score2);
                                documentScores2[document] = score2 + (product / denom2);
                            }
                        }
                    }
                }
            }

            return(NormalizeScores(documentScores1, documentScores2));
        }
Пример #3
0
        /// <summary>
        /// Builds an <see cref="IndexedDocument"/> from a document entity. Text extraction is skipped
        /// unless it is explicitly requested.
        /// </summary>
        /// <param name="document">The document entity to convert.</param>
        /// <param name="extractText">
        /// When <c>true</c>, the file at the document's path is passed to the text extractor and the
        /// extracted text becomes the contents; otherwise the contents are set to <c>null</c>.
        /// </param>
        /// <returns>The <see cref="IndexedDocument"/> populated from the document entity.</returns>
        private IndexedDocument GetLuceneDocumentFromDocument(Document document, bool extractText = false)
        {
            // The id is copied from the uploaded document so that Lucene results can be mapped
            // straight back to documents in the system
            var indexed = new IndexedDocument
            {
                Title = document.Title,
                Id    = document.Id
            };

            // Only run the extractor when asked to
            indexed.Contents = extractText ? _textExtractor.Extract(document.Path).Text : null;

            // Indexing time is "now"; creation time mirrors the moment the document was uploaded
            indexed.DateIndexed = DateTime.Now;
            indexed.DateCreated = document.DateUploaded;

            return indexed;
        }
        /// <summary>
        /// Scores <paramref name="document"/> using the requested detection strategy.
        /// </summary>
        /// <param name="document">The document to score.</param>
        /// <param name="detectionStrategy">Which scoring routine to dispatch to; defaults to <c>ByDocument</c>.</param>
        /// <returns>The score table produced by the selected routine.</returns>
        /// <exception cref="NotImplementedException">The strategy is neither <c>ByDocument</c> nor <c>ByPhrase</c>.</exception>
        public IDictionary <string, double> GetScore(IndexedDocument document, DetectionStrategy detectionStrategy = DetectionStrategy.ByDocument)
        {
            if (detectionStrategy == DetectionStrategy.ByDocument)
            {
                return GetScoreByDocument(document);
            }

            if (detectionStrategy == DetectionStrategy.ByPhrase)
            {
                return GetScoreByPhrase(document);
            }

            // Any other strategy value has no scoring routine
            throw new NotImplementedException();
        }
        /// <summary>
        /// Converts an <see cref="IndexedDocument"/> into Lucene documents, one per phrase.
        /// </summary>
        /// <param name="indexedDocument">The document whose phrases are converted.</param>
        /// <returns>
        /// A list holding one Lucene document per phrase; each carries the phrase text together with
        /// the source file name and id.
        /// </returns>
        public static IEnumerable <Document> ToLuceneDocuments(this IndexedDocument indexedDocument)
        {
            var luceneDocuments = new List <Document>();

            foreach (var phraseText in indexedDocument.Phrases)
            {
                var phraseDocument = new Document();

                // Phrase text, using the project's term-vector field type
                var textField = new Field(IndexFieldNames.Text, phraseText, CustomeFieldTypes.StoreTermVectorsField);
                phraseDocument.Add(textField);

                // File name and id are stored so a hit can be traced back to its source document
                var fileField = new StringField(IndexFieldNames.File, indexedDocument.FileName, Field.Store.YES);
                phraseDocument.Add(fileField);

                var idField = new StringField(IndexFieldNames.Id, indexedDocument.Id.ToString(), Field.Store.YES);
                phraseDocument.Add(idField);

                luceneDocuments.Add(phraseDocument);
            }

            return luceneDocuments;
        }
        /// <summary>
        /// Shows a single document, or an explanatory message when it cannot be shown.
        /// </summary>
        /// <param name="id">Identifier of the document to display.</param>
        /// <returns>
        /// A redirect to "Index" when no usable id is supplied; otherwise the view, given the document
        /// or <c>null</c> (with <c>ViewBag.Message</c> set) when it is missing or not permitted.
        /// </returns>
        public ActionResult View(ObjectId id)
        {
            bool idMissing = id == null || id.Equals(ObjectId.Empty);
            if (idMissing)
            {
                return RedirectToAction("Index");
            }

            IndexedDocument found = documentRepository.GetById(id);

            if (found == null)
            {
                ViewBag.Message = "Document not found.";
                return View(found);
            }

            if (HavePermissionsForDocument(found))
            {
                return View(found);
            }

            // Never hand a document to the view that the current user may not see
            IndexedDocument hidden = null;
            ViewBag.Message = "You have no permissions to view this document.";
            return View(hidden);
        }
        /// <summary>
        /// Handles a document upload: validates the submission, builds an indexed document for the
        /// current user and saves it.
        /// </summary>
        /// <param name="document">The submitted upload model.</param>
        /// <returns>
        /// The upload view with a model error when validation fails; otherwise a redirect to "Index".
        /// </returns>
        public ActionResult Upload(DocumentModel document)
        {
            // Work out the first validation failure, if any
            string validationError = null;

            if (!ModelState.IsValid)
            {
                validationError = "File was not selected.";
            }
            else if (!document.HasValidExtension())
            {
                validationError = "Document's extension is not supported.";
            }

            if (validationError != null)
            {
                ModelState.AddModelError("", validationError);
                return View(document);
            }

            var uploader = User as User;

            documentRepository.Save(new IndexedDocument(document, uploader, documentIndexator));

            return RedirectToAction("Index");
        }
Пример #8
0
        /// <inheritdoc />
        public override XPathNavigator this[string key]
        {
            get
            {
                string file;

                // A key that is not in the index has no file to search, so there is nothing to return
                if(!index.TryGetValue(key, out file))
                {
                    return null;
                }

                IndexedDocument document;

                // Serve from the cache when possible; otherwise load the file and try to cache it
                if(!cache.TryGetValue(file, out document))
                {
                    document = new IndexedDocument(this, file);

                    // If the cache is full, drop the file that was queued earliest
                    if(cache.Count >= cacheSize)
                    {
                        IndexedDocument evictedDoc;
                        string evictedFile;

                        if(queue.TryDequeue(out evictedFile))
                        {
                            cache.TryRemove(evictedFile, out evictedDoc);
                        }
                    }

                    // Queue the file for eviction only when this call really inserted it.  If another
                    // caller cached the same file first, TryAdd fails; enqueueing anyway would leave a
                    // duplicate queue entry that later evicts the file ahead of its turn.
                    if(cache.TryAdd(file, document))
                    {
                        queue.Enqueue(file);
                    }
                }

                return document[key];
            }
        }
        /// <summary>
        /// Scores indexed documents against <paramref name="indexedDocument"/> taken as a whole.
        /// </summary>
        /// <remarks>
        /// All phrases whose <c>Length</c> is at least 5 are joined with a space and tokenized into one
        /// frequency vector.  The Lucene service is asked for the documents matching each term, the
        /// matches are merged per document name, and their term vectors are fetched in one call.  A
        /// shared term contributes only when <c>scamConfig.Epsilon - (f1/f2 + f2/f1)</c> is positive.
        /// Two running totals are kept per document: one divided by the test document's sum of squared
        /// frequencies and one divided by the matched document's.
        /// </remarks>
        /// <param name="indexedDocument">The document to score.</param>
        /// <returns>The result of <c>NormalizeScores</c> applied to the two accumulated score tables.</returns>
        private IDictionary <string, double> GetScoreByDocument(IndexedDocument indexedDocument)
        {
            var documentScores1 = new Dictionary <string, double>();
            var documentScores2 = new Dictionary <string, double>();

            var phrases       = indexedDocument.Phrases.Where(ph => ph.Length >= 5);
            var testFeqVector = vectorTokenizer.GetTokenFrequencyVector(string.Join(' ', phrases));

            var docToDocIds = new Dictionary <string, ISet <int> >();

            foreach (var term in testFeqVector.Keys)
            {
                var occurences = luceneService.GetDocumentsWithMatches(term);
                foreach (var kv in occurences)
                {
                    ISet <int> knownIds;
                    if (docToDocIds.TryGetValue(kv.Key, out knownIds))
                    {
                        knownIds.UnionWith(kv.Value);
                    }
                    else
                    {
                        // Store a copy: the set is merged into on later terms, and merging into the
                        // instance handed out by the service would modify the service's own data
                        docToDocIds.Add(kv.Key, new HashSet <int>(kv.Value));
                    }
                }
            }

            var docTermVectors = luceneService.GetTermVectors(docToDocIds);

            // Sum of squared term frequencies of the test document
            var denom1 = testFeqVector.Select(kv => kv.Value * kv.Value).Sum();

            foreach (var documentName in docTermVectors.Keys)
            {
                var documentTerms = docTermVectors[documentName];

                // Sum of squared term frequencies of the matched document
                var denom2 = documentTerms.Select(kv => kv.Value * kv.Value).Sum();

                foreach (var documentTerm in documentTerms)
                {
                    // Only terms present on both sides can contribute
                    if (!testFeqVector.ContainsKey(documentTerm.Key))
                    {
                        continue;
                    }

                    long f1 = testFeqVector[documentTerm.Key];
                    long f2 = documentTerm.Value;

                    // Kept as a positive "> 0" test on purpose: a NaN result (0/0) must not pass
                    if ((scamConfig.Epsilon - (((double)f1 / f2) + ((double)f2 / f1))) > 0)
                    {
                        double product = (double)(f1 * f2);

                        // TryGetValue leaves the out value at 0 for a new key, so one lookup covers
                        // both the "first contribution" and the "add to total" cases
                        double score1;
                        documentScores1.TryGetValue(documentName, out score1);
                        documentScores1[documentName] = score1 + (product / denom1);

                        double score2;
                        documentScores2.TryGetValue(documentName, out score2);
                        documentScores2[documentName] = score2 + (product / denom2);
                    }
                }
            }

            return(NormalizeScores(documentScores1, documentScores2));
        }
Пример #10
0
 /// <summary>
 /// Tells whether the current user owns <paramref name="document"/>.
 /// </summary>
 /// <param name="document">The document to check.</param>
 /// <returns>
 /// <c>true</c> when the document's <c>UserId</c> equals the current user's <c>Id</c>;
 /// <c>false</c> otherwise, including when there is no document or the current principal
 /// is not a <c>User</c>.
 /// </returns>
 public bool HavePermissionsForDocument(IndexedDocument document)
 {
     var currentUser = User as User;

     // Deny instead of throwing a NullReferenceException when the "as" cast yields null
     // (the principal is not an application User) or no document was supplied
     if (document == null || currentUser == null)
     {
         return false;
     }

     return document.UserId.Equals(currentUser.Id);
 }
Пример #11
0
        /// <summary>
        /// Run the application: compile every non-ignored document under the input folder, write the
        /// results to the output folder, then optionally emit a Markdown table of contents and an HTML
        /// Help project.
        /// </summary>
        /// <param name="commandLine">The command line arguments.</param>
        /// <returns>The error code for the application; always 0 when no exception is thrown.</returns>
        /// <exception cref="ArgumentException">A document names a layout that cannot be loaded from the layouts folder.</exception>
        private int Run(CommandLine commandLine)
        {
            Uri outputUri = new Uri(commandLine.OutputFolder);
            List<IndexedDocument> indexedDocs = new List<IndexedDocument>();

            // Build up a list of directories to ignore when processing documents.  Materialized once so
            // the paths are not recombined for every file visited below.
            var ignored = commandLine.Ignored.Select(dir => Path.Combine(commandLine.InputFolder, dir)).ToList();

            foreach (string documentPath in Directory.GetFiles(commandLine.InputFolder, "*.*", SearchOption.AllDirectories))
            {
                // Skip this document if its path is ignored, but keep processing the remaining files.
                if (ignored.Any(str => documentPath.StartsWith(str, StringComparison.OrdinalIgnoreCase)))
                {
                    continue;
                }

                Document doc = Document.Create(documentPath, commandLine.InputFolder);
                string documentOutputPath = Path.Combine(commandLine.OutputFolder, doc.RelativeOutputPath);
                string content = doc.Content;

                List<string> defines = new List<string>();
                defines.Add(String.Concat("content=", content)); // ensure "content" variable is first so it always wins.

                string layout;
                if (doc.Meta.TryGetValue("layout", out layout))
                {
                    string layoutContent;
                    if (!this.TryLoadLayout(commandLine.LayoutsFolder, layout, out layoutContent))
                    {
                        throw new ArgumentException(String.Format("Error could not find layout: {0} in the layout folder: {1} while processing document: {2}", layout, commandLine.LayoutsFolder, doc.RelativePath));
                    }

                    content = layoutContent; // replace the content with the layout, hopefully the layout has "{{content}}" in it somewhere.
                }

                defines.AddRange(commandLine.Variables); // command-line variables trump document meta.
                defines.AddRange(doc.Meta.Select(meta => String.Concat(meta.Key, "=", meta.Value))); // document meta is last.

                content = SubstituteVariables(defines, content);

                content = DocCompiler.FixRelativePaths(content, new Uri(documentOutputPath), outputUri);

                // Every processed document is indexed, even when its output is suppressed below.
                var indexedDoc = new IndexedDocument(doc, commandLine.OutputFolder);
                indexedDocs.Add(indexedDoc);

                if (!indexedDoc.ChmIgnored)
                {
                    Output(content, documentOutputPath);
                }
            }

            List<IndexedDocument> ordered = OrderIndexedDocuments(indexedDocs);
            // Useful context when debugging.
            //DumpIndex(rootDoc);
            //Console.WriteLine("------");
            //DumpOrderedIndexedDocuments(ordered);

            if (!String.IsNullOrEmpty(commandLine.AppendMarkdownTableOfContentsFile))
            {
                AppendMarkdownTableOfContents(ordered, commandLine.AppendMarkdownTableOfContentsFile, commandLine.IgnoreXsdSimpleTypeInTableOfContents);
            }

            if (!String.IsNullOrEmpty(commandLine.HtmlHelpProjectFile))
            {
                GenerateHtmlHelpProject(ordered, commandLine.HtmlHelpProjectFile, commandLine.OutputFolder);
            }

            return 0;
        }
Пример #12
0
 /// <summary>
 /// Appends <paramref name="doc"/> and then, recursively, everything reachable through its
 /// <c>Next</c> collection to <paramref name="ordered"/>.
 /// </summary>
 /// <param name="doc">The document to start from; it is added before any of its successors.</param>
 /// <param name="ordered">The list that receives the documents in visit order.</param>
 private static void TraverseIndexedDocuments(IndexedDocument doc, List<IndexedDocument> ordered)
 {
     // Record the current document first, then descend into each successor in turn
     ordered.Add(doc);

     foreach (IndexedDocument successor in doc.Next)
     {
         TraverseIndexedDocuments(successor, ordered);
     }
 }
Пример #13
0
 /// <summary>
 /// Saves a document by inserting it into <c>DocumetCollection</c>.
 /// </summary>
 /// <param name="document">The document to insert.</param>
 public void Save(IndexedDocument document)
 {
     // NOTE(review): this always inserts; presumably saving an already stored document
     // is handled elsewhere - confirm against callers
     DocumetCollection.Insert(document);
 }