// Loads a stored document from the Lucene index via the static Searcher.
// Note: the indexName parameter is not used by this overload.
public static CorpusDocument GetDocument(string indexName, int indexDocumentId)
{
    var doc = Searcher.Doc(indexDocumentId);
    var ret = new CorpusDocument
    {
        Id = doc.Get("Id"),
        Title = doc.Get("Title")
    };
    ret.SetContent(doc.Get("Content"), CorpusDocument.ContentFormat.Html);
    return ret;
}
//private static MarkdownSharp.Markdown MarkdownConverter
//{
//    get { return _markdownConverter ?? (_markdownConverter = new MarkdownSharp.Markdown()); }
//}
//private static MarkdownSharp.Markdown _markdownConverter;

// Converts the document's wiki markup to HTML via the ScrewTurn formatter.
public static void Foo(CorpusDocument doc)
{
    string redirectToTopic;
    var htmlContent = ScrewTurn.Wiki.Formatter.Format(doc.Title, doc.Content,
        new { Name = doc.Title, TopicId = doc.Id }, true, false, out redirectToTopic);
}
private void GotDocument(CorpusDocument doc)
{
    if (doc == null)
        return;

    var word = string.Empty;
    var tokens = new List<HebMorph.Token>();

    // Strip all HTML tags
    var strippedContent = Regex.Replace(doc.Content, @"</?[A-Z][A-Z0-9]*\b[^>]*>", " ",
        RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Remove Wikipedia inter-language referral tags
    strippedContent = Regex.Replace(strippedContent, @"\[\[([A-Z-]+?):(.+?):(.+?)\]\]", " ",
        RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase);

    lemmatizer.SetStream(new System.IO.StringReader(strippedContent));

    // The HebMorph lemmatizer will always return a token, unless an unrecognized Hebrew
    // word was hit, in which case an empty tokens list is returned.
    while (lemmatizer.LemmatizeNextToken(out word, tokens) != 0)
    {
        // Invalid token
        if (string.IsNullOrEmpty(word) || word.Length <= 1)
            continue;

        // Unrecognized Hebrew word
        if (tokens.Count == 0)
        {
            var o = radix.Lookup(word);
            if (o != null)
            {
                o.Count++;
            }
            else
            {
                o = new CoverageData { Count = 1, FirstKnownLocation = doc.Id, KnownToHSpell = false };
                radix.AddNode(word, o);
            }
            continue;
        }

        // Otherwise, the token is either in the dictionary already, or is not a Hebrew word.
        // If we are performing a complete coverage computation, add it to the radix as well.
        if (ComputeCoverage)
        {
            // A non-Hebrew word
            if (tokens.Count == 1 && !(tokens[0] is HebMorph.HebrewToken))
                continue;

            // A Hebrew word with one lemma or more - store the word in the radix with a flag
            // signaling it was indeed recognized
            var o = radix.Lookup(word);
            if (o != null)
            {
                o.Count++;
            }
            else
            {
                o = new CoverageData { Count = 1, FirstKnownLocation = doc.Id, KnownToHSpell = true };
                radix.AddNode(word, o);
            }
        }
    }
}
public CorpusDocument GetDocument(string indexName, int indexDocumentId)
{
    var searcher = GetSearcher(indexName);
    if (searcher == null)
        throw new ArgumentException("Index not found: " + indexName);

    var doc = searcher.Doc(indexDocumentId);
    var ret = new CorpusDocument
    {
        Id = doc.Get("Id"),
        Title = doc.Get("Title")
    };
    ret.SetContent(doc.Get("Content"), CorpusDocument.ContentFormat.Html);
    return ret;
}
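// --- Usage sketch (illustrative only) ---
// A minimal example of how the pieces above might be wired together: iterate the document ids
// of a Lucene index, load each stored document with GetDocument, and feed it to GotDocument
// for lemmatization and coverage counting. The CorpusReader type name, the "wiki-he" index
// name, and the documentCount parameter are assumptions made for this sketch and do not
// appear in the original source.
private void ProcessCorpus(CorpusReader reader, int documentCount)
{
    const string indexName = "wiki-he"; // hypothetical index name

    for (var i = 0; i < documentCount; i++)
    {
        // Lucene document ids are zero-based; deleted documents are not handled in this sketch.
        var doc = reader.GetDocument(indexName, i);
        GotDocument(doc);
    }
}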