Ejemplo n.º 1
0
 /// <summary>
 /// Loads the stored document at <paramref name="indexDocumentId"/> from the shared
 /// <c>Searcher</c> and copies its "Id", "Title" and "Content" fields into a new
 /// <see cref="CorpusDocument"/>. The content is registered as HTML.
 /// </summary>
 /// <param name="indexName">Not used by this overload; the lookup always goes through <c>Searcher</c>.</param>
 /// <param name="indexDocumentId">Document number passed to <c>Searcher.Doc</c>.</param>
 /// <returns>A newly created <see cref="CorpusDocument"/> populated from the stored fields.</returns>
 public static CorpusDocument GetDocument(string indexName, int indexDocumentId)
 {
     var stored = Searcher.Doc(indexDocumentId);

     var result = new CorpusDocument();
     result.Id = stored.Get("Id");
     result.Title = stored.Get("Title");
     result.SetContent(stored.Get("Content"), CorpusDocument.ContentFormat.Html);

     return result;
 }
Ejemplo n.º 2
0
 //private static MarkdownSharp.Markdown MarkdownConverter
 //{
 //    get { return _markdownConverter ?? (_markdownConverter = new MarkdownSharp.Markdown()); }
 //}
 //private static MarkdownSharp.Markdown _markdownConverter;
 /// <summary>
 /// Passes the document's title and content through the ScrewTurn wiki formatter.
 /// Both the formatted output and the redirect target reported by the formatter
 /// are discarded; nothing is returned to the caller.
 /// </summary>
 /// <param name="doc">Document whose <c>Title</c>, <c>Content</c> and <c>Id</c> are handed to the formatter.</param>
 public static void Foo(CorpusDocument doc)
 {
     // Receives the formatter's redirect target; not inspected afterwards.
     string redirectTarget;

     var formatted = ScrewTurn.Wiki.Formatter.Format(
         doc.Title,
         doc.Content,
         new { Name = doc.Title, TopicId = doc.Id },
         true,
         false,
         out redirectTarget);
 }
        /// <summary>
        /// Strips markup from the document's content, feeds the text to the lemmatizer and
        /// tallies words in the radix. A word for which the lemmatizer produced no tokens is
        /// always tallied (stored with <c>KnownToHSpell = false</c>). A word that did produce
        /// tokens is tallied (stored with <c>KnownToHSpell = true</c>) only when
        /// <c>ComputeCoverage</c> is set and the word is not a lone non-<c>HebrewToken</c>.
        /// </summary>
        /// <param name="doc">Document to process; a null document is ignored.</param>
        private void GotDocument(CorpusDocument doc)
        {
            if (doc == null)
            {
                return;
            }

            const RegexOptions scrubOptions = RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase;

            // Replace every HTML tag with a single space
            var text = Regex.Replace(doc.Content, @"</?[A-Z][A-Z0-9]*\b[^>]*>", " ", scrubOptions);

            // Replace wikipedia language referral tags with a single space
            text = Regex.Replace(text, @"\[\[([A-Z-]+?):(.+?):(.+?)\]\]", " ", scrubOptions);

            lemmatizer.SetStream(new System.IO.StringReader(text));

            string word;
            var tokens = new List<HebMorph.Token>();

            // Per the original author's note: the lemmatizer yields a token for every word,
            // except that an unrecognized Hebrew word comes back with an empty token list.
            while (lemmatizer.LemmatizeNextToken(out word, tokens) != 0)
            {
                // Skip empty and single-character words
                if (string.IsNullOrEmpty(word) || word.Length < 2)
                {
                    continue;
                }

                // An empty token list marks an unrecognized Hebrew word
                var recognized = tokens.Count > 0;

                if (recognized)
                {
                    // Recognized words are only tracked during a full coverage computation
                    if (!ComputeCoverage)
                    {
                        continue;
                    }

                    // A single token that is not a HebrewToken: a non-Hebrew word, not tracked
                    if (tokens.Count == 1 && !(tokens[0] is HebMorph.HebrewToken))
                    {
                        continue;
                    }
                }

                // Bump the counter of a word already in the radix, or register it with the
                // current document as its first known location.
                var entry = radix.Lookup(word);
                if (entry == null)
                {
                    entry = new CoverageData { Count = 1, FirstKnownLocation = doc.Id, KnownToHSpell = recognized };
                    radix.AddNode(word, entry);
                }
                else
                {
                    entry.Count++;
                }
            }
        }
Ejemplo n.º 4
0
		/// <summary>
		/// Resolves the searcher for <paramref name="indexName"/>, loads the stored document at
		/// <paramref name="indexDocumentId"/> and copies its "Id", "Title" and "Content" fields
		/// into a new <see cref="CorpusDocument"/>. The content is registered as HTML.
		/// </summary>
		/// <param name="indexName">Name handed to <c>GetSearcher</c> to locate the index.</param>
		/// <param name="indexDocumentId">Document number passed to the searcher's <c>Doc</c> method.</param>
		/// <returns>A newly created <see cref="CorpusDocument"/> populated from the stored fields.</returns>
		/// <exception cref="ArgumentException"><c>GetSearcher</c> returned null for <paramref name="indexName"/>.</exception>
		public CorpusDocument GetDocument(string indexName, int indexDocumentId)
		{
			var indexSearcher = GetSearcher(indexName);
			if (indexSearcher == null)
			{
				throw new ArgumentException("Index not found: " + indexName);
			}

			var stored = indexSearcher.Doc(indexDocumentId);

			var result = new CorpusDocument();
			result.Id = stored.Get("Id");
			result.Title = stored.Get("Title");
			result.SetContent(stored.Get("Content"), CorpusDocument.ContentFormat.Html);

			return result;
		}