/// <summary>
/// Announces the load on the console, then passes <c>GeneratedCorpusXmlFilePath</c>
/// to a new <c>CorpusDocumentFactory</c> and stores the result in <c>Corpus</c>.
/// </summary>
void CreateCorpus()
{
    Console.WriteLine("Loading words by verse");

    Corpus = new CorpusDocumentFactory().Create(GeneratedCorpusXmlFilePath);
}
/// <summary>
/// Repeats the variational optimisation steps for one document until the relative
/// change of the likelihood bound no longer exceeds <c>_parameters.VarConvergence</c>
/// or the iteration limit is reached.
/// </summary>
/// <param name="varInference">Variational state that is optimised in place.</param>
/// <param name="doc">Document passed to the optimisation and bound-update steps.</param>
/// <param name="model">Model passed to the optimisation and bound-update steps.</param>
/// <returns>The value of <c>varInference.LHood</c> after the last iteration.</returns>
private double Inference(
    VariationalInferenceParameter varInference,
    CorpusDocument doc,
    LLNAModel model)
{
    double relativeChange = 0;
    bool keepIterating;

    varInference.UpdateLikelihoodBound(doc, model);

    do
    {
        varInference.IncrementIter();

        // Zeta is re-optimised before each of the other parameter updates.
        varInference.OptimizeZeta(model);
        varInference.OptimizeLambda(model, doc);
        varInference.OptimizeZeta(model);
        varInference.OptimizeNu(model, doc);
        varInference.OptimizeZeta(model);
        varInference.OptimizePhi(model, doc);

        double previousBound = varInference.LHood;
        varInference.UpdateLikelihoodBound(doc, model);
        relativeChange = Math.Abs((previousBound - varInference.LHood) / previousBound);

        bool stillMoving = relativeChange > _parameters.VarConvergence;

        // A negative VarMaxIter disables the iteration cap.
        keepIterating = stillMoving
            && (_parameters.VarMaxIter < 0 || varInference.NIter < _parameters.VarMaxIter);
    }
    while (keepIterating);

    // Converged unless the last relative change still exceeds the threshold.
    varInference.Converged = !(relativeChange > _parameters.VarConvergence);

    return varInference.LHood;
}
/// <summary>
/// Logs a debug message, then passes <c>GeneratedCorpusXmlFilePath</c> to a new
/// <c>CorpusDocumentFactory</c> and stores the result in <c>Corpus</c>.
/// </summary>
void CreateCorpus()
{
    Logger.Debug("Loading Corpus");

    Corpus = new CorpusDocumentFactory().Create(GeneratedCorpusXmlFilePath);
}
/// <summary>
/// Reads the document with the given id from <c>Searcher</c> and copies its
/// "Id", "Title" and "Content" fields into a new <c>CorpusDocument</c>.
/// </summary>
/// <param name="indexName">Not used by this method.</param>
/// <param name="indexDocumentId">Id passed to <c>Searcher.Doc</c>.</param>
/// <returns>The populated document; its content is set with the Html format.</returns>
public static CorpusDocument GetDocument(string indexName, int indexDocumentId)
{
    var stored = Searcher.Doc(indexDocumentId);

    var result = new CorpusDocument();
    result.Id = stored.Get("Id");
    result.Title = stored.Get("Title");
    result.SetContent(stored.Get("Content"), CorpusDocument.ContentFormat.Html);

    return result;
}
/// <summary>
/// Creates a <c>Document</c> by storing each argument in the member of the same name.
/// </summary>
/// <param name="quranDocument">Value stored in <c>QuranDocument</c>.</param>
/// <param name="hadithDocument">Value stored in <c>HadithDocument</c>.</param>
/// <param name="tafsirDocument">Value stored in <c>TafsirDocument</c>.</param>
/// <param name="rootWordsDocument">Value stored in <c>RootWordsDocument</c>.</param>
/// <param name="corpusDocument">Value stored in <c>CorpusDocument</c>.</param>
/// <param name="lexiconDocument">Value stored in <c>LexiconDocument</c>.</param>
public Document(
    QuranDocument quranDocument,
    HadithDocument hadithDocument,
    TafsirDocument tafsirDocument,
    WordsDocument rootWordsDocument,
    CorpusDocument corpusDocument,
    LexiconDocument lexiconDocument)
{
    QuranDocument = quranDocument;
    HadithDocument = hadithDocument;
    TafsirDocument = tafsirDocument;
    RootWordsDocument = rootWordsDocument;
    CorpusDocument = corpusDocument;
    LexiconDocument = lexiconDocument;
}
/// <summary>
/// Returns the document's content as HTML.
/// </summary>
/// <remarks>
/// Only <c>WikiMarkup</c> content is converted (through the ScrewTurn formatter).
/// Every other format, including Markdown whose conversion is currently disabled,
/// is returned exactly as stored.
/// </remarks>
/// <param name="doc">The document to render.</param>
/// <returns>
/// The rendered, trimmed HTML; an empty string when the formatter output is a
/// redirect notice; or <c>doc.Content</c> unchanged for non-wiki formats.
/// </returns>
public static string AsHtml(this CorpusDocument doc)
{
    if (doc.Format != CorpusDocument.ContentFormat.WikiMarkup)
    {
        // Either a fallback or the content is already HTML.
        return doc.Content;
    }

    string redirectToTopic; // required by the formatter signature; redirects are not supported
    var htmlContent = ScrewTurn.Wiki.Formatter.Format(
        doc.Title,
        doc.Content,
        new { Name = doc.Title, TopicId = doc.Id },
        true,
        false,
        out redirectToTopic);

    // Redirect pages carry no content of their own. The markers are fixed markup,
    // so they are compared ordinally rather than with the current culture.
    if (htmlContent.StartsWith("Redirected to", System.StringComparison.Ordinal)
        || htmlContent.StartsWith("<ol><li>הפניה", System.StringComparison.Ordinal))
    {
        return string.Empty;
    }

    return CollapseLineBreaks(htmlContent).Trim();
}

/// <summary>
/// Replaces every run of two or more consecutive <c>&lt;br /&gt;</c> tags with a single one,
/// making up for the formatter's line-break handling.
/// </summary>
private static string CollapseLineBreaks(string html)
{
    const string SingleBreak = "<br />";
    const string DoubleBreak = "<br /><br />";

    var collapsed = new StringBuilder(html.Length);
    int copyFrom = 0;
    int runStart;

    // IndexOf returns -1 when nothing is found, so a match at index 0 is a valid hit
    // and must be processed like any other.
    while ((runStart = html.IndexOf(DoubleBreak, copyFrom, System.StringComparison.Ordinal)) >= 0)
    {
        collapsed.Append(html, copyFrom, runStart - copyFrom);
        collapsed.Append(SingleBreak);

        // Swallow any further breaks of the same run, including one that ends
        // exactly at the end of the string.
        int runEnd = runStart + DoubleBreak.Length;
        while (runEnd + SingleBreak.Length <= html.Length
               && string.CompareOrdinal(html, runEnd, SingleBreak, 0, SingleBreak.Length) == 0)
        {
            runEnd += SingleBreak.Length;
        }

        copyFrom = runEnd;
    }

    collapsed.Append(html, copyFrom, html.Length - copyFrom);
    return collapsed.ToString();
}
// Disabled Markdown converter, kept for reference:
//private static MarkdownSharp.Markdown MarkdownConverter
//{
//    get { return _markdownConverter ?? (_markdownConverter = new MarkdownSharp.Markdown()); }
//}
//private static MarkdownSharp.Markdown _markdownConverter;

/// <summary>
/// Runs the ScrewTurn formatter over the document's title and content.
/// Both the formatted result and the redirect target are discarded.
/// </summary>
/// <param name="doc">Document whose title, content and id are handed to the formatter.</param>
public static void Foo(CorpusDocument doc)
{
    string redirectToTopic;
    var formatterContext = new { Name = doc.Title, TopicId = doc.Id };

    ScrewTurn.Wiki.Formatter.Format(doc.Title, doc.Content, formatterContext, true, false, out redirectToTopic);
}
/// <summary>
/// Scans one block of text for wiki topics and raises <c>OnDocument</c> for each complete one.
/// </summary>
/// <remarks>
/// A topic is recognised as a <c>&lt;title&gt;…&lt;/title&gt;</c>, followed by <c>&lt;id&gt;…&lt;/id&gt;</c>,
/// followed by a closing <c>&lt;/text&gt;</c>. Scanning stops at the first topic for which one of
/// these markers is missing, when no further <c>&lt;title&gt;</c> exists, or when <c>AbortReading</c> is set.
/// Topics whose title starts with one of the hard-coded meta-page prefixes are skipped.
/// On a normal return, <c>previousBlockBeginning</c> and <c>previousBlockEnd</c> are updated
/// to this block's offsets.
/// </remarks>
/// <param name="currentText">The string to index</param>
/// <param name="beginning">The beginning offset of the block</param>
/// <param name="end">The end offset of the block</param>
/// <param name="charCarryOver">
/// Compared against the position of the first title: when greater than zero and the first
/// title starts before it, a previous block must have been recorded. Presumably the number of
/// leading characters carried over from the previous block — TODO confirm against the caller.
/// </param>
/// <param name="lastBlock">True if this is the last block</param>
/// <returns>
/// The number of characters at the end of <paramref name="currentText"/> to keep for the next block.
/// Always 0 when <paramref name="lastBlock"/> is true.
/// </returns>
/// <exception cref="Exception">
/// Thrown when the first title lies in the carryover area but no previous block has been stored,
/// or when the block contains no <c>&lt;title&gt;</c> at all and is not the last block.
/// </exception>
private int ProcessBlock(string currentText, long beginning, long end, int charCarryOver, bool lastBlock)
{
    var firstRun = true;
    var topicStart = currentText.IndexOf("<title>", StringComparison.InvariantCultureIgnoreCase);
    var title = String.Empty;
    int titleEnd, idStart, idEnd, topicEnd = -1;
    long id;
    bool shouldBreak = false; // set once no further <title> exists after the current topic

    while (topicStart >= 0 && !AbortReading)
    {
        // Reset the per-topic positions so a stale topicEnd cannot leak into the
        // carry-over calculation after an early break.
        titleEnd = -1;
        idStart = -1;
        idEnd = -1;
        topicEnd = -1;

        // Each marker is searched from the position of the previous one; a missing marker
        // means the topic is incomplete in this block, so scanning stops.
        titleEnd = currentText.IndexOf("</title>", topicStart, StringComparison.InvariantCultureIgnoreCase);
        if (titleEnd < 0) break;
        title = currentText.Substring(topicStart + "<title>".Length, titleEnd - topicStart - "<title>".Length);
        title = System.Web.HttpUtility.HtmlDecode(title); // The title is stored HTML encoded

        idStart = currentText.IndexOf("<id>", titleEnd, StringComparison.InvariantCultureIgnoreCase);
        if (idStart < 0) break;
        idEnd = currentText.IndexOf("</id>", idStart, StringComparison.InvariantCultureIgnoreCase);
        if (idEnd < 0) break;
        id = Convert.ToInt64(currentText.Substring(idStart + "<id>".Length, idEnd - idStart - "<id>".Length));

        topicEnd = currentText.IndexOf("</text>", idEnd, StringComparison.InvariantCultureIgnoreCase);
        if (topicEnd < 0) break;

        // Start creating the object for the tokenizing ThreadPool thread
        // NOTE(review): begins/ends are filled in below but not read again within this method.
        var begins = new long[1];
        var ends = new long[1];

        // Was there a carryover?
        if (firstRun)
        {
            // Did the <title> happen in the carryover area?
            if (charCarryOver > 0 && topicStart < charCarryOver)
            {
                if (previousBlockBeginning > -1 && previousBlockEnd > -1)
                {
                    // The topic spans two blocks: record the previous block's offsets as well.
                    begins = new long[2];
                    ends = new long[2];
                    begins[1] = previousBlockBeginning;
                    ends[1] = previousBlockEnd;
                }
                else
                {
                    throw new Exception("A Wiki topic title carryover occurred, but no previous block has been stored");
                }
            }
        }

        begins[0] = beginning;
        ends[0] = end;

        var prevTopicStart = topicStart; // Store the last successful title start position

        // Advance to the next topic now, so the skip path below can simply `continue`.
        var nextTopicStart = currentText.IndexOf("<title>", topicStart + 1, StringComparison.InvariantCultureIgnoreCase);
        if (nextTopicStart >= 0)
        {
            topicStart = nextTopicStart;
        }
        else
        {
            // topicStart keeps pointing at the current (last) topic; leave the loop after handling it.
            shouldBreak = true;
        }

        firstRun = false;

        // skip meta-pages - pages with irrelevant or no content
        if (title.StartsWith("תבנית:") || title.StartsWith("עזרה:") || title.StartsWith("ויקיפדיה:") || title.StartsWith("קטגוריה:") || title.StartsWith("קובץ:") || title.StartsWith("פורטל:"))
        {
            if (shouldBreak) break;
            continue;
        }

        // Cut out the whole topic, from its <title> through the closing </text>.
        var contents = currentText.Substring(prevTopicStart, topicEnd - prevTopicStart + 7/* == "</text>".Length */);
        contents = GetContentSection(contents, id, title);

        // For some weird reason, the Niqqud character Dagesh is not being used directly in he-wiki but
        // through the use of special markup
        var strippedContent = contents.Replace("{{דגש}}", "\u05BC");

        // Process document
        var doc = new CorpusDocument {Id = id.ToString(), Title = title};
        doc.SetContent(strippedContent, CorpusDocument.ContentFormat.WikiMarkup);
        if (OnDocument != null) OnDocument(doc);

        if (shouldBreak) break;
    }

    // Now calculate how many characters we need to save for next block
    var charsToSave = 0;
    if (topicStart == -1)
    {
        // No <title> was found at all; only acceptable for the final block.
        if (!lastBlock)
        {
            throw new Exception("No topics were found in the block");
        }
    }
    else
    {
        if (!lastBlock)
        {
            if (topicEnd == -1)
            {
                // The topic at topicStart is incomplete: keep everything from its <title> onwards.
                charsToSave = currentText.Length - topicStart;
            }
            else
            {
                if (topicStart < topicEnd)
                {
                    // topicStart still refers to the topic that ended at topicEnd:
                    // keep only what follows its </text>.
                    charsToSave = currentText.Length - topicEnd - "</text>".Length;
                }
                else
                {
                    // topicStart lies at or beyond topicEnd: keep everything from topicStart onwards.
                    charsToSave = currentText.Length - topicStart;
                }
            }
        }
    }

    // Remember this block's offsets for a possible carryover in the next call.
    previousBlockBeginning = beginning;
    previousBlockEnd = end;

    return charsToSave;
}