Ejemplo n.º 1
0
        /// <summary>
        /// Announces the load on the console and assigns <c>Corpus</c> from the XML file
        /// located at <c>GeneratedCorpusXmlFilePath</c>.
        /// </summary>
        void CreateCorpus()
        {
            Console.WriteLine("Loading words by verse");

            // The factory is only needed for this single call, so it is not kept in a local.
            Corpus = new CorpusDocumentFactory().Create(GeneratedCorpusXmlFilePath);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Repeatedly optimises the variational parameters for one document until the relative
        /// change of the likelihood bound is no longer above <c>_parameters.VarConvergence</c>,
        /// or until <c>_parameters.VarMaxIter</c> iterations have run (a negative cap means no cap).
        /// </summary>
        /// <param name="varInference">Variational state that is optimised in place.</param>
        /// <param name="doc">The document the bound is computed for.</param>
        /// <param name="model">The model passed to every optimisation step.</param>
        /// <returns>The value of <c>varInference.LHood</c> after the last iteration.</returns>
        private double Inference(
            VariationalInferenceParameter varInference,
            CorpusDocument doc,
            LLNAModel model)
        {
            double relativeChange;

            varInference.UpdateLikelihoodBound(doc, model);
            while (true)
            {
                varInference.IncrementIter();

                // Zeta is re-optimised before each of the other parameter updates.
                varInference.OptimizeZeta(model);
                varInference.OptimizeLambda(model, doc);
                varInference.OptimizeZeta(model);
                varInference.OptimizeNu(model, doc);
                varInference.OptimizeZeta(model);
                varInference.OptimizePhi(model, doc);

                double previousBound = varInference.LHood;
                varInference.UpdateLikelihoodBound(doc, model);
                relativeChange = Math.Abs((previousBound - varInference.LHood) / previousBound);

                // Written as a negated "greater than" so that a NaN ratio is treated
                // the same way as a ratio at or below the threshold.
                if (!(relativeChange > _parameters.VarConvergence))
                {
                    break;
                }

                bool mayIterateAgain = _parameters.VarMaxIter < 0
                                       || varInference.NIter < _parameters.VarMaxIter;
                if (!mayIterateAgain)
                {
                    break;
                }
            }

            varInference.Converged = !(relativeChange > _parameters.VarConvergence);

            return varInference.LHood;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Logs a debug message and assigns <c>Corpus</c> from the XML file located at
        /// <c>GeneratedCorpusXmlFilePath</c>.
        /// </summary>
        void CreateCorpus()
        {
            Logger.Debug("Loading Corpus");

            // The factory is only needed for this single call, so it is not kept in a local.
            Corpus = new CorpusDocumentFactory().Create(GeneratedCorpusXmlFilePath);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Reads the stored document with the given id from <c>Searcher</c> and copies its
        /// "Id", "Title" and "Content" fields into a new <c>CorpusDocument</c>.
        /// </summary>
        /// <param name="indexName">Not read by this method.</param>
        /// <param name="indexDocumentId">Identifier passed to <c>Searcher.Doc</c>.</param>
        /// <returns>The populated document; its content is registered as HTML.</returns>
        public static CorpusDocument GetDocument(string indexName, int indexDocumentId)
        {
            var stored = Searcher.Doc(indexDocumentId);

            var result = new CorpusDocument();
            result.Id = stored.Get("Id");
            result.Title = stored.Get("Title");
            result.SetContent(stored.Get("Content"), CorpusDocument.ContentFormat.Html);

            return result;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Creates a document aggregate, storing each supplied part in the member of the same name.
 /// </summary>
 /// <param name="quranDocument">Value stored in <c>QuranDocument</c>.</param>
 /// <param name="hadithDocument">Value stored in <c>HadithDocument</c>.</param>
 /// <param name="tafsirDocument">Value stored in <c>TafsirDocument</c>.</param>
 /// <param name="rootWordsDocument">Value stored in <c>RootWordsDocument</c>.</param>
 /// <param name="corpusDocument">Value stored in <c>CorpusDocument</c>.</param>
 /// <param name="lexiconDocument">Value stored in <c>LexiconDocument</c>.</param>
 public Document(
     QuranDocument quranDocument,
     HadithDocument hadithDocument,
     TafsirDocument tafsirDocument,
     WordsDocument rootWordsDocument,
     CorpusDocument corpusDocument,
     LexiconDocument lexiconDocument)
 {
     QuranDocument = quranDocument;
     HadithDocument = hadithDocument;
     TafsirDocument = tafsirDocument;
     RootWordsDocument = rootWordsDocument;
     CorpusDocument = corpusDocument;
     LexiconDocument = lexiconDocument;
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Returns the content of <paramref name="doc"/> as HTML.
        /// </summary>
        /// <remarks>
        /// Wiki markup is rendered through the ScrewTurn formatter, after which every run of two
        /// or more consecutive <c>&lt;br /&gt;</c> tags is collapsed into a single tag and the
        /// result is trimmed. Pages the formatter reports as redirects produce an empty string
        /// because redirects are not supported. Content in any other format is returned unchanged.
        /// Markdown conversion is currently disabled.
        /// </remarks>
        /// <param name="doc">The document to render.</param>
        /// <returns>The HTML for the document, or an empty string for a redirect page.</returns>
        public static string AsHtml(this CorpusDocument doc)
        {
            switch (doc.Format)
            {
            case CorpusDocument.ContentFormat.WikiMarkup:
                string redirectToTopic;
                var htmlContent = ScrewTurn.Wiki.Formatter.Format(
                    doc.Title,
                    doc.Content,
                    new
                    {
                        Name    = doc.Title,
                        TopicId = doc.Id
                    },
                    true,
                    false,
                    out redirectToTopic);

                // We currently do not support the notion of redirects. The prefixes are markup,
                // not linguistic text, so they are matched ordinally.
                if (htmlContent.StartsWith("Redirected to", System.StringComparison.Ordinal)
                    || htmlContent.StartsWith("<ol><li>הפניה", System.StringComparison.Ordinal))
                {
                    return string.Empty;
                }

                // Make up for dumb <br> handling by the formatter.
                return CollapseLineBreaks(htmlContent).Trim();
            }

            return doc.Content; // either a fallback or it is already HTML
        }

        /// <summary>
        /// Replaces every run of two or more consecutive <c>&lt;br /&gt;</c> tags with one tag.
        /// </summary>
        /// <param name="html">The markup to clean up.</param>
        /// <returns>The markup with runs of line breaks collapsed.</returns>
        private static string CollapseLineBreaks(string html)
        {
            const string LineBreak = "<br />";
            const string DoubleLineBreak = LineBreak + LineBreak;

            var sb = new StringBuilder(html.Length);
            int copyFrom = 0;
            int runStart;

            // ">= 0" so that a run located at the very start of the text is collapsed as well.
            while ((runStart = html.IndexOf(DoubleLineBreak, copyFrom, System.StringComparison.Ordinal)) >= 0)
            {
                sb.Append(html, copyFrom, runStart - copyFrom);
                sb.Append(LineBreak);

                // Skip the remaining tags of the run; "<=" lets a tag that ends exactly at the
                // end of the text be consumed too.
                int runEnd = runStart + DoubleLineBreak.Length;
                while (runEnd + LineBreak.Length <= html.Length
                       && string.CompareOrdinal(html, runEnd, LineBreak, 0, LineBreak.Length) == 0)
                {
                    runEnd += LineBreak.Length;
                }

                copyFrom = runEnd;
            }

            sb.Append(html, copyFrom, html.Length - copyFrom);
            return sb.ToString();
        }
Ejemplo n.º 7
0
        //private static MarkdownSharp.Markdown MarkdownConverter
        //{
        //    get { return _markdownConverter ?? (_markdownConverter = new MarkdownSharp.Markdown()); }
        //}
        //private static MarkdownSharp.Markdown _markdownConverter;

        /// <summary>
        /// Runs the ScrewTurn formatter over the document's title and content.
        /// The formatted output and the redirect target are not used afterwards.
        /// </summary>
        /// <param name="doc">The document whose title, content and id are passed to the formatter.</param>
        public static void Foo(CorpusDocument doc)
        {
            var pageTitle = doc.Title;
            var pageMarkup = doc.Content;
            var pageContext = new { Name = doc.Title, TopicId = doc.Id };

            string redirectToTopic;
            var formatted = ScrewTurn.Wiki.Formatter.Format(
                pageTitle,
                pageMarkup,
                pageContext,
                true,
                false,
                out redirectToTopic);
        }
        /// <summary>
        /// Scans one block of dump text for Wiki topics and raises <c>OnDocument</c> for each
        /// complete topic found.
        /// </summary>
        /// <remarks>
        /// A topic is recognised as a <c>&lt;title&gt;…&lt;/title&gt;</c> element followed by an
        /// <c>&lt;id&gt;…&lt;/id&gt;</c> element and a closing <c>&lt;/text&gt;</c> tag. Scanning
        /// stops at the first topic that is missing one of these parts, when no further
        /// <c>&lt;title&gt;</c> exists, or when <c>AbortReading</c> is set. Topics whose title
        /// starts with one of the listed Hebrew namespace prefixes are skipped. On exit the
        /// <c>previousBlockBeginning</c> and <c>previousBlockEnd</c> members are set to
        /// <paramref name="beginning"/> and <paramref name="end"/>.
        /// </remarks>
        /// <param name="currentText">The string to index</param>
        /// <param name="beginning">The beginning offset of the block</param>
        /// <param name="end">The end offset of the block</param>
        /// <param name="charCarryOver">
        /// Compared against the position of the first title: when positive and greater than that
        /// position, the title is treated as having started in text carried over from the
        /// previous block (presumably the number of carried-over characters; confirm with caller).
        /// </param>
        /// <param name="lastBlock">True if this is the last block</param>
        /// <returns>
        /// The number of characters at the end of <paramref name="currentText"/> to keep for the
        /// next block: the text after the last processed <c>&lt;/text&gt;</c>, or the text from
        /// the last unfinished <c>&lt;title&gt;</c> onwards. Always 0 when
        /// <paramref name="lastBlock"/> is true.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when a title lies in the carryover area but no previous block has been recorded,
        /// or when a block other than the last one contains no <c>&lt;title&gt;</c> at all.
        /// </exception>
        private int ProcessBlock(string currentText, long beginning, long end, int charCarryOver, bool lastBlock)
        {
            var firstRun = true;
            var topicStart = currentText.IndexOf("<title>", StringComparison.InvariantCultureIgnoreCase);
			var title = String.Empty;

            int titleEnd, idStart, idEnd, topicEnd = -1;
            long id;
        	bool shouldBreak = false;

            while (topicStart >= 0 && !AbortReading)
            {
                // Reset the per-topic positions; topicEnd staying at -1 after the loop signals
                // that the topic starting at topicStart was not completed.
                titleEnd = -1;
                idStart = -1;
                idEnd = -1;
                topicEnd = -1;

                titleEnd = currentText.IndexOf("</title>", topicStart, StringComparison.InvariantCultureIgnoreCase);

                if (titleEnd < 0)
                    break;

                title = currentText.Substring(topicStart + "<title>".Length, titleEnd - topicStart - "<title>".Length);
            	title = System.Web.HttpUtility.HtmlDecode(title); // The title is stored HTML encoded

                // The first <id> element after the title is taken as the topic id
                idStart = currentText.IndexOf("<id>", titleEnd, StringComparison.InvariantCultureIgnoreCase);
                if (idStart < 0)
                    break;

                idEnd = currentText.IndexOf("</id>", idStart, StringComparison.InvariantCultureIgnoreCase);
                if (idEnd < 0)
                    break;

                id = Convert.ToInt64(currentText.Substring(idStart + "<id>".Length, idEnd - idStart - "<id>".Length));
                
                topicEnd = currentText.IndexOf("</text>", idEnd, StringComparison.InvariantCultureIgnoreCase);
                if (topicEnd < 0)
                    break;

                // Start creating the object for the tokenizing ThreadPool thread
                // NOTE(review): begins/ends are filled in below but never read again in this
                // method; only the consistency check (and its exception) has an effect here.
                var begins = new long[1];
                var ends = new long[1];

                // Was there a carryover?
                if (firstRun)
                {
                    // Did the <title> happen in the carryover area?
                    if (charCarryOver > 0 &&
                        topicStart < charCarryOver)
                    {
                        if (previousBlockBeginning > -1 &&
                            previousBlockEnd > -1)
                        {
                            begins = new long[2];
                            ends = new long[2];

                            begins[1] = previousBlockBeginning;
                            ends[1] = previousBlockEnd;
                        }
                        else
                        {
                            throw new Exception("A Wiki topic title carryover occurred, but no previous block has been stored");
                        }
                    }
                }

                begins[0] = beginning;
                ends[0] = end;

            	var prevTopicStart = topicStart;
				// Remember where the current topic starts, then advance topicStart to the next
				// <title>; if there is none, topicStart is left unchanged and the loop ends
				// after this topic.
				var nextTopicStart = currentText.IndexOf("<title>", topicStart + 1, StringComparison.InvariantCultureIgnoreCase);
				if (nextTopicStart >= 0)
				{
					topicStart = nextTopicStart;
				}
				else
				{
					shouldBreak = true;
				}

				firstRun = false;

				// skip meta-pages - pages with irrelevant or no content
				if (title.StartsWith("תבנית:") || title.StartsWith("עזרה:") || title.StartsWith("ויקיפדיה:")
					|| title.StartsWith("קטגוריה:") || title.StartsWith("קובץ:") || title.StartsWith("פורטל:"))
				{
					if (shouldBreak) break;
					continue;
				}

                // Cut out the whole topic, from its <title> up to and including </text>
            	var contents = currentText.Substring(prevTopicStart, topicEnd - prevTopicStart + 7/* == "</text>".Length */);
                contents = GetContentSection(contents, id, title);

				// For some weird reason, the Niqqud character Dagesh is not being used directly in he-wiki but
				// through the use of special markup
				var strippedContent = contents.Replace("{{דגש}}", "\u05BC");
                
                // Process document
            	var doc = new CorpusDocument {Id = id.ToString(), Title = title};
            	doc.SetContent(strippedContent, CorpusDocument.ContentFormat.WikiMarkup);
				if (OnDocument != null) OnDocument(doc);

				if (shouldBreak) break;
            }

            // Now calculate how many characters we need to save for next block
            var charsToSave = 0;
            if (topicStart == -1)
            {
                // No <title> at all: only acceptable for the final block
                if (!lastBlock)
                {
                    throw new Exception("No topics were found in the block");
                }
            }
            else
            {
                if (!lastBlock)
                {
                    if (topicEnd == -1)
                    {
                        // The topic at topicStart is incomplete: keep everything from its <title>
                        charsToSave = currentText.Length - topicStart;
                    }
                    else
                    {
                        if (topicStart < topicEnd)
                        {
                            // The last topic was completed: keep only what follows its </text>
                            charsToSave = currentText.Length - topicEnd - "</text>".Length;
                        }
                        else
                        {
                            // topicStart already points at a later <title> that was not processed
                            charsToSave = currentText.Length - topicStart;
                        }
                    }
                }
            }

            // Record this block's offsets so the next call can validate a carryover against them
            previousBlockBeginning = beginning;
            previousBlockEnd = end;

            return charsToSave;
        }