/// <inheritdoc />
public override XPathNavigator this[string key]
{
    get
    {
        string file;

        // A key that is not in the index has no backing file, so there is nothing to return
        if (!index.TryGetValue(key, out file))
        {
            return null;
        }

        IndexedDocument document;

        // Cache hit: answer straight from the already loaded document
        if (cache.TryGetValue(file, out document))
        {
            return document[key];
        }

        // Cache miss: load the document from its file
        document = new IndexedDocument(this, file);

        // When the cache has reached its size limit, evict the file at the head of the queue
        if (cache.Count >= cacheSize)
        {
            var evictedFile = queue.Dequeue();
            cache.Remove(evictedFile);
        }

        // Remember the freshly loaded document and record its file for later eviction
        cache.Add(file, document);
        queue.Enqueue(file);

        return document[key];
    }
}
/// <summary>
/// Scores indexed documents against the given document, one phrase at a time.
/// </summary>
/// <param name="indexedDocument">Document whose phrases are compared against the index.</param>
/// <returns>
/// The result of <c>NormalizeScores</c> applied to the two per-document score tables
/// accumulated here (one divided by the phrase's denominator, one by the matched document's).
/// </returns>
private IDictionary <string, double> GetScoreByPhrase(IndexedDocument indexedDocument)
{
    var documentScores1 = new Dictionary <string, double>();
    var documentScores2 = new Dictionary <string, double>();

    // Phrases whose Length is below 5 are not scored
    foreach (var phrase in indexedDocument.Phrases.Where(ph => ph.Length >= 5))
    {
        var frequencyVector = vectorTokenizer.GetTokenFrequencyVector(phrase);

        // Sum of squared term frequencies of the phrase
        long denom1 = frequencyVector.Select(kv => kv.Value * kv.Value).Sum();

        // NOTE(review): a document returned for several terms of the same phrase is processed
        // once per term, and each pass adds the contribution of every shared term again.
        // GetScoreByDocument collects candidates first and visits each once - confirm that the
        // repeated accumulation here is intended.
        foreach (var term in frequencyVector.Keys)
        {
            var occurences = luceneService.GetOccurences(term);

            foreach (var document in occurences.GetDocuments())
            {
                var documentTerms = occurences.GetTermsVectors(document);

                // Sum of squared term frequencies of the candidate document
                long denom2 = documentTerms.Select(kv => kv.Value * kv.Value).Sum();

                foreach (var documentTerm in documentTerms)
                {
                    // Only terms shared with the phrase contribute
                    if (!frequencyVector.ContainsKey(documentTerm.Key))
                    {
                        continue;
                    }

                    long f1 = frequencyVector[documentTerm.Key];
                    long f2 = documentTerm.Value;

                    // A shared term counts only while f1/f2 + f2/f1 stays below the configured epsilon
                    if ((scamConfig.Epsilon - (((double)f1 / f2) + ((double)f2 / f1))) > 0)
                    {
                        double product = (double)(f1 * f2);

                        // TryGetValue leaves the running total at 0 for a document seen for the first time
                        double total1;
                        documentScores1.TryGetValue(document, out total1);
                        documentScores1[document] = total1 + (product / denom1);

                        double total2;
                        documentScores2.TryGetValue(document, out total2);
                        documentScores2[document] = total2 + (product / denom2);
                    }
                }
            }
        }
    }

    return NormalizeScores(documentScores1, documentScores2);
}
/// <summary>
/// Builds the <see cref="IndexedDocument"/> used for indexing from a document entity.
/// Text extraction is skipped unless it is explicitly requested.
/// </summary>
/// <param name="document">The document entity to convert.</param>
/// <param name="extractText">
/// When <c>true</c>, the contents are filled with the text extracted from the file at the
/// document's path; when <c>false</c>, the contents are left <c>null</c>.
/// </param>
/// <returns>The <see cref="IndexedDocument"/> populated from the entity.</returns>
private IndexedDocument GetLuceneDocumentFromDocument(Document document, bool extractText = false)
{
    return new IndexedDocument
    {
        Title = document.Title,

        // Reuse the entity's id so search results can be mapped straight back to
        // the documents stored in the system
        Id = document.Id,

        // Run the text extractor only when the caller asked for it
        Contents = extractText ? _textExtractor.Extract(document.Path).Text : null,

        // Indexing time is "now"; creation time mirrors the entity's upload date
        DateIndexed = DateTime.Now,
        DateCreated = document.DateUploaded
    };
}
/// <summary>
/// Scores the given document using the requested detection strategy.
/// </summary>
/// <param name="document">Document to score.</param>
/// <param name="detectionStrategy">Strategy to apply; defaults to <c>ByDocument</c>.</param>
/// <returns>The score table produced by the selected strategy.</returns>
/// <exception cref="NotImplementedException">The strategy is neither <c>ByDocument</c> nor <c>ByPhrase</c>.</exception>
public IDictionary <string, double> GetScore(IndexedDocument document, DetectionStrategy detectionStrategy = DetectionStrategy.ByDocument)
{
    if (detectionStrategy == DetectionStrategy.ByDocument)
    {
        return GetScoreByDocument(document);
    }

    if (detectionStrategy == DetectionStrategy.ByPhrase)
    {
        return GetScoreByPhrase(document);
    }

    // Any other value has no scoring implementation
    throw new NotImplementedException();
}
/// <summary>
/// Converts an indexed document into Lucene documents, one per phrase.
/// </summary>
/// <param name="indexedDocument">Source document whose phrases are converted.</param>
/// <returns>
/// A list holding one Lucene document per phrase; each carries the phrase text together
/// with the file name and id of the source document.
/// </returns>
public static IEnumerable <Document> ToLuceneDocuments(this IndexedDocument indexedDocument)
{
    var luceneDocuments = new List <Document>();

    foreach (var phrase in indexedDocument.Phrases)
    {
        // The phrase text uses the custom field type from CustomeFieldTypes;
        // file name and id are stored string fields
        var textField = new Field(IndexFieldNames.Text, phrase, CustomeFieldTypes.StoreTermVectorsField);
        var fileField = new StringField(IndexFieldNames.File, indexedDocument.FileName, Field.Store.YES);
        var idField = new StringField(IndexFieldNames.Id, indexedDocument.Id.ToString(), Field.Store.YES);

        var entry = new Document();
        entry.Add(textField);
        entry.Add(fileField);
        entry.Add(idField);

        luceneDocuments.Add(entry);
    }

    return luceneDocuments;
}
/// <summary>
/// Shows a single document, or an explanatory message when it cannot be shown.
/// </summary>
/// <param name="id">Id of the document to display.</param>
/// <returns>
/// A redirect to "Index" when the id is missing or empty; otherwise the view, with the
/// document as model or with a <c>null</c> model and <c>ViewBag.Message</c> set.
/// </returns>
public ActionResult View(ObjectId id)
{
    // Without a usable id there is nothing to look up
    if (id == null || id.Equals(ObjectId.Empty))
    {
        return RedirectToAction("Index");
    }

    IndexedDocument found = documentRepository.GetById(id);

    // The model stays null unless the document exists and the user may see it
    IndexedDocument model = null;

    if (found == null)
    {
        ViewBag.Message = "Document not found.";
    }
    else if (HavePermissionsForDocument(found))
    {
        model = found;
    }
    else
    {
        ViewBag.Message = "You have no permissions to view this document.";
    }

    return View(model);
}
/// <summary>
/// Handles a document upload: validates the posted model, saves it, and returns to the index.
/// </summary>
/// <param name="document">The posted upload model.</param>
/// <returns>
/// The upload view with a model error when validation fails; otherwise a redirect to "Index"
/// after the document has been saved.
/// </returns>
public ActionResult Upload(DocumentModel document)
{
    // Work out why the upload must be rejected, if at all. The extension is only
    // checked once the model state itself is valid.
    string rejection = null;

    if (!ModelState.IsValid)
    {
        rejection = "File was not selected.";
    }
    else if (!document.HasValidExtension())
    {
        rejection = "Document's extension is not supported.";
    }

    if (rejection != null)
    {
        ModelState.AddModelError("", rejection);
        return View(document);
    }

    // Build the indexed document for the current user and save it
    var indexedDocument = new IndexedDocument(document, User as User, documentIndexator);
    documentRepository.Save(indexedDocument);

    return RedirectToAction("Index");
}
/// <inheritdoc />
public override XPathNavigator this[string key]
{
    get
    {
        IndexedDocument document;
        string file;

        // Look up the file corresponding to the key
        if(index.TryGetValue(key, out file))
        {
            // Now look for that file in the cache
            if(!cache.TryGetValue(file, out document))
            {
                // Not in the cache, so load it
                document = new IndexedDocument(this, file);

                // If the cache is full, remove a document.  The queue may momentarily be empty,
                // in which case nothing is evicted on this pass.
                if(cache.Count >= cacheSize)
                {
                    IndexedDocument cacheDoc;
                    string cacheFile;

                    if(queue.TryDequeue(out cacheFile))
                        cache.TryRemove(cacheFile, out cacheDoc);
                }

                // Add the new document to the cache.  TryAdd fails when the file was cached by
                // someone else in the meantime; queuing the file again in that case would leave
                // two queue entries for one cache entry and cause a later, premature eviction.
                // The document loaded above still serves this request either way.
                if(cache.TryAdd(file, document))
                    queue.Enqueue(file);
            }

            return document[key];
        }

        return null;
    }
}
/// <summary>
/// Scores indexed documents against the given document as a whole: the phrases are joined
/// into one text, candidate documents are collected per term, and every candidate is then
/// compared once against the combined term frequencies.
/// </summary>
/// <param name="indexedDocument">Document whose phrases are compared against the index.</param>
/// <returns>
/// The result of <c>NormalizeScores</c> applied to the two per-document score tables
/// accumulated here (one divided by the test document's denominator, one by the candidate's).
/// </returns>
private IDictionary <string, double> GetScoreByDocument(IndexedDocument indexedDocument)
{
    var documentScores1 = new Dictionary <string, double>();
    var documentScores2 = new Dictionary <string, double>();

    // Phrases whose Length is below 5 are left out of the combined text
    var phrases = indexedDocument.Phrases.Where(ph => ph.Length >= 5);
    var testFeqVector = vectorTokenizer.GetTokenFrequencyVector(string.Join(' ', phrases));

    // Collect, per document name, the union of the ids reported for any term
    var docToDocIds = new Dictionary <string, ISet <int> >();
    foreach (var term in testFeqVector.Keys)
    {
        var occurences = luceneService.GetDocumentsWithMatches(term);
        foreach (var kv in occurences)
        {
            ISet <int> knownIds;
            if (docToDocIds.TryGetValue(kv.Key, out knownIds))
            {
                knownIds.UnionWith(kv.Value);
            }
            else
            {
                // NOTE(review): the set handed back by the service is stored as-is and is
                // merged into by later UnionWith calls - confirm the service does not reuse it.
                docToDocIds.Add(kv.Key, kv.Value);
            }
        }
    }

    var docTermVectors = luceneService.GetTermVectors(docToDocIds);

    // Sum of squared term frequencies of the test document
    var denom1 = testFeqVector.Select(kv => kv.Value * kv.Value).Sum();

    foreach (var documentName in docTermVectors.Keys)
    {
        var documentTerms = docTermVectors[documentName];

        // Sum of squared term frequencies of the candidate document
        var denom2 = documentTerms.Select(kv => kv.Value * kv.Value).Sum();

        foreach (var documentTerm in documentTerms)
        {
            // Only terms shared with the test document contribute
            if (!testFeqVector.ContainsKey(documentTerm.Key))
            {
                continue;
            }

            long f1 = testFeqVector[documentTerm.Key];
            long f2 = documentTerm.Value;

            // A shared term counts only while f1/f2 + f2/f1 stays below the configured epsilon
            if ((scamConfig.Epsilon - (((double)f1 / f2) + ((double)f2 / f1))) > 0)
            {
                double product = (double)(f1 * f2);

                // TryGetValue leaves the running total at 0 for a document seen for the first time
                double total1;
                documentScores1.TryGetValue(documentName, out total1);
                documentScores1[documentName] = total1 + (product / denom1);

                double total2;
                documentScores2.TryGetValue(documentName, out total2);
                documentScores2[documentName] = total2 + (product / denom2);
            }
        }
    }

    return NormalizeScores(documentScores1, documentScores2);
}
/// <summary>
/// Checks whether the current user may access the given document.
/// </summary>
/// <param name="document">Document to check.</param>
/// <returns>
/// <c>true</c> when the document's <c>UserId</c> equals the current user's <c>Id</c>;
/// <c>false</c> otherwise, including when there is no document or the current principal
/// is not a <c>User</c>.
/// </returns>
public bool HavePermissionsForDocument(IndexedDocument document)
{
    // "as" yields null when the principal is not a User; deny access in that case
    // instead of failing with a NullReferenceException
    var currentUser = User as User;

    if (document == null || currentUser == null)
    {
        return false;
    }

    return document.UserId.Equals(currentUser.Id);
}
/// <summary>
/// Run the application: process every file under the input folder, write the resulting
/// content to the output folder and, when requested, generate the table of contents and
/// the HTML Help project.
/// </summary>
/// <param name="commandLine">The command line arguments.</param>
/// <returns>The error code for the application.</returns>
/// <exception cref="ArgumentException">A document names a layout that cannot be loaded from the layouts folder.</exception>
private int Run(CommandLine commandLine)
{
    Uri outputUri = new Uri(commandLine.OutputFolder);

    List<IndexedDocument> indexedDocs = new List<IndexedDocument>();

    // Build up a list of directories to ignore when processing documents. Materialized once
    // so the paths are not recomputed for every file.
    var ignored = commandLine.Ignored.Select(dir => Path.Combine(commandLine.InputFolder, dir)).ToList();

    foreach (string documentPath in Directory.GetFiles(commandLine.InputFolder, "*.*", SearchOption.AllDirectories))
    {
        // Skip processing if the document path is ignored. This must move on to the next
        // file rather than leave the loop, otherwise every file after the first ignored one
        // would be dropped as well.
        if (ignored.Any(str => documentPath.StartsWith(str, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        Document doc = Document.Create(documentPath, commandLine.InputFolder);
        string documentOutputPath = Path.Combine(commandLine.OutputFolder, doc.RelativeOutputPath);

        string content = doc.Content;

        List<string> defines = new List<string>();
        defines.Add(String.Concat("content=", content)); // ensure "content" variable is first so it always wins.

        string layout;
        if (doc.Meta.TryGetValue("layout", out layout))
        {
            string layoutContent;
            if (!this.TryLoadLayout(commandLine.LayoutsFolder, layout, out layoutContent))
            {
                throw new ArgumentException(String.Format("Error could not find layout: {0} in the layout folder: {1} while processing document: {2}", layout, commandLine.LayoutsFolder, doc.RelativePath));
            }

            content = layoutContent; // replace the content with the layout, hopefully the layout has "{{content}}" in it somewhere.
        }

        defines.AddRange(commandLine.Variables); // command-line variables trump document meta.
        defines.AddRange(doc.Meta.Select(meta => String.Concat(meta.Key, "=", meta.Value))); // document meta is last.

        content = SubstituteVariables(defines, content);

        content = DocCompiler.FixRelativePaths(content, new Uri(documentOutputPath), outputUri);

        // Every processed document is indexed; only those not flagged ChmIgnored are written out.
        var indexedDoc = new IndexedDocument(doc, commandLine.OutputFolder);
        indexedDocs.Add(indexedDoc);

        if (!indexedDoc.ChmIgnored)
        {
            Output(content, documentOutputPath);
        }
    }

    List<IndexedDocument> ordered = OrderIndexedDocuments(indexedDocs);

    // Useful context when debugging.
    //DumpIndex(rootDoc);
    //Console.WriteLine("------");
    //DumpOrderedIndexedDocuments(ordered);

    if (!String.IsNullOrEmpty(commandLine.AppendMarkdownTableOfContentsFile))
    {
        AppendMarkdownTableOfContents(ordered, commandLine.AppendMarkdownTableOfContentsFile, commandLine.IgnoreXsdSimpleTypeInTableOfContents);
    }

    if (!String.IsNullOrEmpty(commandLine.HtmlHelpProjectFile))
    {
        GenerateHtmlHelpProject(ordered, commandLine.HtmlHelpProjectFile, commandLine.OutputFolder);
    }

    return 0;
}
/// <summary>
/// Appends a document and everything reachable through its <c>Next</c> collection to the
/// list, depth first: the document itself is added before any of its successors.
/// </summary>
/// <param name="doc">Document to start from.</param>
/// <param name="ordered">List that receives the documents in visiting order.</param>
private static void TraverseIndexedDocuments(IndexedDocument doc, List<IndexedDocument> ordered)
{
    // Record the current document first, then descend into each successor in turn
    ordered.Add(doc);

    foreach (var successor in doc.Next)
    {
        TraverseIndexedDocuments(successor, ordered);
    }
}
/// <summary>
/// Stores the given document by inserting it into the document collection.
/// </summary>
/// <param name="document">Document to insert.</param>
public void Save(IndexedDocument document) => DocumetCollection.Insert(document);