/// <summary>
/// Builds a paragraph of three pieces of text separated by two runs in the verse-number
/// style, collects its segment annotations, caches them on the paragraph, and checks that
/// five segments result and that the verse segment label of segments 0, 2 and 4 is empty.
/// </summary>
public void OneSegPerVerse()
{
    string pc1 = "Das buch ist rot. ";
    string verse1 = "9";
    string pc2 = "Der Herr ist gross.";
    string verse2 = "10";
    string pc3 = "Ich spreche nicht viel Deutsch.";

    ITsStrBldr bldr = m_tsf.MakeString(pc1 + verse1 + pc2 + verse2 + pc3, m_wsVern).GetBldr();
    // Mark the two verse numbers with the verse-number style.
    bldr.SetStrPropValue(pc1.Length, pc1.Length + verse1.Length,
        (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
    int ichV2 = pc1.Length + verse1.Length + pc2.Length;
    bldr.SetStrPropValue(ichV2, ichV2 + verse2.Length,
        (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
    m_para.Contents.UnderlyingTsString = bldr.GetString();

    // ParagraphParser is disposable; make sure it is released even if an assertion fails.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        List<int> eosIndexes;
        List<int> segments = pp.CollectSegmentAnnotations(m_para.Contents.UnderlyingTsString, out eosIndexes);
        Cache.VwCacheDaAccessor.CacheVecProp(m_para.Hvo, ktagParaSegments, segments.ToArray(), segments.Count);

        Assert.AreEqual(5, segments.Count);
        Assert.AreEqual("", AnnotationRefHandler.VerseSegLabel(m_para, 0, ktagParaSegments));
        Assert.AreEqual("", AnnotationRefHandler.VerseSegLabel(m_para, 2, ktagParaSegments));
        Assert.AreEqual("", AnnotationRefHandler.VerseSegLabel(m_para, 4, ktagParaSegments));
    }
}
/// <summary>
/// Exercises ParagraphParser.CollectSegmentAnnotations on a series of strings and checks both
/// the reported break offsets and the resulting segment boundaries: a string without
/// punctuation, an empty string, two sentences, sentences with an embedded verse-number run,
/// label runs at the start and end of the string, label runs split by a non-white character,
/// and a label run preceded by text that has no end-of-sentence punctuation.
/// </summary>
public void SegmentBreaks()
{
    ITsStrFactory tsf = TsStrFactoryClass.Create();
    // ParagraphParser is disposable; make sure it is released even if an assertion fails.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        string test1 = "This is a simple sentence";
        ITsString tss = tsf.MakeString(test1, 1);
        m_para.Contents.UnderlyingTsString = tss;
        List<int> results;
        List<int> segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[0], results, "no punct string");
        Assert.AreEqual(1, segments.Count);
        VerifySegment(segments[0], 0, test1.Length, m_para.Hvo, "no punct string");

        // Empty string.
        ITsString tssEmpty = tsf.MakeString("", 1);
        m_para.Contents.UnderlyingTsString = tssEmpty;
        segments = pp.CollectSegmentAnnotations(tssEmpty, out results);
        VerifyBreaks(new int[0], results, "empty string");
        Assert.AreEqual(0, segments.Count);

        // String with multiple segments.
        string test2 = "This is a more complex sentence (ending with a 'quote').";
        // Two leading spaces are significant: every offset below expects the digit at
        // test2.Length + 2 and the following sentence at test2.Length + 4.
        string test3 = "  2 ";
        string test4 = "This is the second sentence.";
        ITsString tssMulti = tsf.MakeString(test2 + test3 + test4, 1);
        m_para.Contents.UnderlyingTsString = tssMulti;
        segments = pp.CollectSegmentAnnotations(tssMulti, out results);
        VerifyBreaks(new int[] { test2.Length - 1, test2.Length + test3.Length + test4.Length - 1 },
            results, "multi-sentence string");
        Assert.AreEqual(2, segments.Count);
        // The segments end and begin at the '2' in test3.
        VerifySegment(segments[0], 0, test2.Length + 2, m_para.Hvo, "first seg of multi-sentence");
        VerifySegment(segments[1], test2.Length + 2, test2.Length + test3.Length + test4.Length,
            m_para.Hvo, "second seg of multi-sentence");

        // String with embedded verse/chapter numbers: give the '2' the verse-number style.
        ITsStrBldr bldr = tssMulti.GetBldr();
        bldr.SetStrPropValue(test2.Length + 2, test2.Length + 3,
            (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
        ITsString tssMultiV = bldr.GetString();
        m_para.Contents.UnderlyingTsString = tssMultiV;
        segments = pp.CollectSegmentAnnotations(tssMultiV, out results);
        VerifyBreaks(new int[] { test2.Length - 1, test2.Length + 4, test2.Length + test3.Length + test4.Length - 1 },
            results, "multi-sentence string with verse");
        Assert.AreEqual(3, segments.Count);
        // The verse number and its trailing space form a segment of their own.
        VerifySegment(segments[0], 0, test2.Length + 2, m_para.Hvo, "first seg of multi-sentence w. verse");
        VerifySegment(segments[1], test2.Length + 2, test2.Length + 4, m_para.Hvo, "second seg of multi-sentence w. verse");
        VerifySegment(segments[2], test2.Length + 4, test2.Length + test3.Length + test4.Length,
            m_para.Hvo, "third seg of multi-sentence w. verse");

        // Label runs at the very start (chapter + verse) and at the very end (verse) of the string.
        string test6 = "13 1 ";
        string test7 = "121";
        ITsString tssStartFinish = tsf.MakeString(test6 + test2 + test7, 1);
        bldr = tssStartFinish.GetBldr();
        bldr.SetStrPropValue(0, 2, (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.ChapterNumber);
        bldr.SetStrPropValue(3, test6.Length, (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
        bldr.SetStrPropValue(test6.Length + test2.Length, tssStartFinish.Length,
            (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
        tssStartFinish = bldr.GetString();
        m_para.Contents.UnderlyingTsString = tssStartFinish;
        segments = pp.CollectSegmentAnnotations(tssStartFinish, out results);
        VerifyBreaks(new int[] { test6.Length, test6.Length + test2.Length - 1 }, results, "start/finish breaks");
        Assert.AreEqual(3, segments.Count);
        // The chapter and verse runs, separated only by white space, are expected as one leading segment.
        VerifySegment(segments[0], 0, test6.Length, m_para.Hvo, "first seg of start/finish");
        VerifySegment(segments[1], test6.Length, test6.Length + test2.Length, m_para.Hvo, "second seg of start/finish");
        VerifySegment(segments[2], test6.Length + test2.Length, tssStartFinish.Length, m_para.Hvo, "third seg of start/finish");

        // However, anything non-white between two label-style runs separates them. Change the space between the
        // two runs to something that's neither an EOS nor a letter.
        bldr = tssStartFinish.GetBldr();
        bldr.ReplaceTsString(2, 3, tsf.MakeString(":", 1));
        ITsString tssSplitLabelRuns = bldr.GetString();
        m_para.Contents.UnderlyingTsString = tssSplitLabelRuns;
        segments = pp.CollectSegmentAnnotations(tssSplitLabelRuns, out results);
        VerifyBreaks(new int[] { 2, 3, test6.Length, test6.Length + test2.Length - 1 }, results, "broken pair breaks");
        Assert.AreEqual(5, segments.Count);
        // Chapter run, the ':' and the verse run are now expected as three separate segments.
        VerifySegment(segments[0], 0, 2, m_para.Hvo, "first seg of broken pair");
        VerifySegment(segments[1], 2, 3, m_para.Hvo, "2nd seg of start/finish");
        VerifySegment(segments[2], 3, test6.Length, m_para.Hvo, "3rd seg of start/finish");
        VerifySegment(segments[3], test6.Length, test6.Length + test2.Length, m_para.Hvo, "second seg of start/finish");
        VerifySegment(segments[4], test6.Length + test2.Length, tssStartFinish.Length, m_para.Hvo, "third seg of start/finish");

        // Check that we get the correct breaks when the material before a label segment doesn't have an EOS.
        string test8 = "This text has no EOS ";
        ITsString tssMultiNoEos = tsf.MakeString(test8 + test3 + test4, 1);
        bldr = tssMultiNoEos.GetBldr();
        bldr.SetStrPropValue(test8.Length + 2, test8.Length + 3,
            (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
        tssMultiNoEos = bldr.GetString();
        m_para.Contents.UnderlyingTsString = tssMultiNoEos;
        segments = pp.CollectSegmentAnnotations(tssMultiNoEos, out results);
        VerifyBreaks(new int[] { test8.Length + 2, test8.Length + 4, test8.Length + test3.Length + test4.Length - 1 },
            results, "no EOS before label");
        Assert.AreEqual(3, segments.Count);
        // The segments end and begin at the '2' in test3.
        VerifySegment(segments[0], 0, test8.Length + 2, m_para.Hvo, "first seg ofno EOS before label");
        VerifySegment(segments[1], test8.Length + 2, test8.Length + 4, m_para.Hvo, "second seg of no EOS before label");
        VerifySegment(segments[2], test8.Length + 4, test8.Length + test3.Length + test4.Length,
            m_para.Hvo, "third seg of no EOS before label");
    }
}
/// <summary>
/// Checks segmentation around a hard line break (U+2028): the break character is expected
/// to form a one-character segment of its own, both when the preceding text has no
/// end-of-sentence punctuation and when it ends in "?!".
/// </summary>
public void HardLineBreaks()
{
    ITsStrFactory tsf = TsStrFactoryClass.Create();
    // ParagraphParser is disposable; make sure it is released even if an assertion fails.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        // String with an embedded hard line break.
        string test1 = "This is a simple sentence";
        string lineBreak = "\x2028";
        string test3 = "with a hard break.";
        ITsString tss = tsf.MakeString(test1 + lineBreak + test3, 1);
        m_para.Contents.UnderlyingTsString = tss;
        List<int> results;
        List<int> segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test1.Length, test1.Length + 1, tss.Length - 1 }, results, "simple string with hard break");
        Assert.AreEqual(3, segments.Count);
        // The segments break around the line break character.
        VerifySegment(segments[0], 0, test1.Length, m_para.Hvo, "simple string with hard break");
        VerifySegment(segments[1], test1.Length, test1.Length + 1, m_para.Hvo, "simple string with hard break");
        VerifySegment(segments[2], test1.Length + 1, tss.Length, m_para.Hvo, "simple string with hard break");

        // Now try with an EOS before the hard break.
        string test1a = "This is a proper sentence?!";
        tss = tsf.MakeString(test1a + lineBreak + test3, 1);
        m_para.Contents.UnderlyingTsString = tss;
        segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test1a.Length - 2, test1a.Length + 1, tss.Length - 1 }, results, "EOS before hard break");
        Assert.AreEqual(3, segments.Count);
        // The segments still break around the line break character.
        VerifySegment(segments[0], 0, test1a.Length, m_para.Hvo, "EOS before hard break");
        VerifySegment(segments[1], test1a.Length, test1a.Length + 1, m_para.Hvo, "EOS before hard break");
        VerifySegment(segments[2], test1a.Length + 1, tss.Length, m_para.Hvo, "EOS before hard break");
    }
}
/// <summary>
/// Checks that an object replacement character (U+FFFC) carrying object data is segmented
/// like a label: the text before it, the ORC plus the following character, and the rest of
/// the sentence are expected as three separate segments.
/// </summary>
public void OrcIsLabel()
{
    ITsStrFactory tsf = TsStrFactoryClass.Create();
    // ParagraphParser is disposable; make sure it is released even if an assertion fails.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        // String with embedded ORC.
        string test1 = "This is a simple sentence";
        string test2 = "\xfffc";
        string test3 = " with a footnote.";
        ITsString tss = tsf.MakeString(test1 + test2 + test3, 1);
        // To be recognized an ORC must have unique properties.
        ITsStrBldr bldr = tss.GetBldr();
        bldr.SetStrPropValue(test1.Length, test1.Length + test2.Length, (int)FwTextPropType.ktptObjData, "nonsence");
        tss = bldr.GetString();
        m_para.Contents.UnderlyingTsString = tss;

        List<int> results;
        List<int> segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test1.Length, test1.Length + test2.Length + 1, test1.Length + test2.Length + test3.Length - 1 },
            results, "multi-sentence string with ORC");
        Assert.AreEqual(3, segments.Count);
        // The segments break around the ORC.
        VerifySegment(segments[0], 0, test1.Length, m_para.Hvo, "first seg of multi-sentence w. ORC");
        VerifySegment(segments[1], test1.Length, test1.Length + test2.Length + 1, m_para.Hvo, "second seg of multi-sentence w. ORC");
        VerifySegment(segments[2], test1.Length + test2.Length + 1, test1.Length + test2.Length + test3.Length,
            m_para.Hvo, "third seg of multi-sentence w. ORC");
    }
}
/// <summary>
/// Checks that a question mark at the start of a sentence does not end a segment, both at
/// the start of the paragraph and directly after a leading verse-number run.
/// </summary>
public void LeadingPunctuation()
{
    ITsStrFactory tsf = TsStrFactoryClass.Create();
    // ParagraphParser is disposable; make sure it is released even if an assertion fails.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        string test1 = "?This is a question with special punctuation?";
        ITsString tss = tsf.MakeString(test1, 1);
        m_para.Contents.UnderlyingTsString = tss;
        List<int> results;
        List<int> segments = pp.CollectSegmentAnnotations(tss, out results);
        // Only the final question mark is reported as a break.
        VerifyBreaks(new int[] { test1.Length - 1 }, results, "leading QM");
        Assert.AreEqual(1, segments.Count);
        VerifySegment(segments[0], 0, test1.Length, m_para.Hvo, "leading QM");

        // Now try leading punctuation following a verse number.
        ITsStrBldr bldr = tss.GetBldr();
        string verse = "5 ";
        bldr.Replace(0, 0, verse, null);
        bldr.SetStrPropValue(0, 1, (int)FwTextPropType.ktptNamedStyle, ScrStyleNames.VerseNumber);
        ITsString tssMultiV = bldr.GetString();
        m_para.Contents.UnderlyingTsString = tssMultiV;
        segments = pp.CollectSegmentAnnotations(tssMultiV, out results);
        VerifyBreaks(new int[] { verse.Length, tssMultiV.Length - 1 }, results, "leading verse and QM");
        Assert.AreEqual(2, segments.Count);
        VerifySegment(segments[0], 0, verse.Length, m_para.Hvo, "first seg of leading verse and QM");
        VerifySegment(segments[1], verse.Length, tssMultiV.Length, m_para.Hvo, "second seg of leading verse and QM");
    }
}
/// <summary>
/// Checks how periods that are not simple sentence ends are segmented: three-dot ellipses
/// and periods inside references such as "Gen 1.2" are expected not to break, whereas runs
/// of two or four dots are expected to break (also when surrounded by digits).
/// </summary>
public void EllipsesAndRefs()
{
    ITsStrFactory tsf = TsStrFactoryClass.Create();
    // ParagraphParser is disposable; make sure it is released even if an assertion fails.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        // Ellipses and reference periods: only the final period is a break.
        string test1 = "This is...not ... a simple sentence; it discusses Scripture (Gen 1.2 and Rom 1.2-4.5) and has ellipses.";
        ITsString tss = tsf.MakeString(test1, 1);
        m_para.Contents.UnderlyingTsString = tss;
        List<int> results;
        List<int> segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test1.Length - 1 }, results, "ellipses verse period string");
        Assert.AreEqual(1, segments.Count);
        VerifySegment(segments[0], 0, test1.Length, m_para.Hvo, "ellipses verse period");

        // Two dots in the middle and at the end.
        string test2a = "Here we have";
        string twoDots = "..";
        string test2b = "just two periods, and at the end, another two";
        tss = tsf.MakeString(test2a + twoDots + test2b + twoDots, 1);
        m_para.Contents.UnderlyingTsString = tss;
        segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test2a.Length, test2a.Length + 2 + test2b.Length }, results, "string with double dots");
        Assert.AreEqual(2, segments.Count);
        VerifySegment(segments[0], 0, test2a.Length + 2, m_para.Hvo, "string with double dots(1)");
        VerifySegment(segments[1], test2a.Length + 2, tss.Length, m_para.Hvo, "string with double dots(2)");

        // A final three-dot ellipsis produces no break at all.
        string test3 = "This sentence ends with an ellipsis...";
        tss = tsf.MakeString(test3, 1);
        m_para.Contents.UnderlyingTsString = tss;
        segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { }, results, "string with final ellipsis");
        Assert.AreEqual(1, segments.Count);
        VerifySegment(segments[0], 0, test3.Length, m_para.Hvo, "string with final ellipsis");

        // Four dots in the middle and at the end.
        string fourDots = "....";
        tss = tsf.MakeString(test2a + fourDots + test2b + fourDots, 1);
        m_para.Contents.UnderlyingTsString = tss;
        segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test2a.Length, test2a.Length + 4 + test2b.Length }, results, "string with four dots");
        Assert.AreEqual(2, segments.Count);
        VerifySegment(segments[0], 0, test2a.Length + 4, m_para.Hvo, "string with four dots(1)");
        VerifySegment(segments[1], test2a.Length + 4, tss.Length, m_para.Hvo, "string with four dots(2)");

        // Case 2 periods with surrounding numbers
        string test5a = "Here is a number and two dots: 5";
        string test5b = "2 and another number, and the final dot has a number before it: 2.";
        tss = tsf.MakeString(test5a + twoDots + test5b, 1);
        m_para.Contents.UnderlyingTsString = tss;
        segments = pp.CollectSegmentAnnotations(tss, out results);
        VerifyBreaks(new int[] { test5a.Length, test5a.Length + 2 + test5b.Length - 1 }, results, "string with numbers and double dots");
        Assert.AreEqual(2, segments.Count);
        // One plus 2 for the two dots, but the following digit and space go in the previous segment, too.
        VerifySegment(segments[0], 0, test5a.Length + 2 + 2, m_para.Hvo, "string with numbers and double dots(1)");
        VerifySegment(segments[1], test5a.Length + 2 + 2, tss.Length, m_para.Hvo, "string with numbers and double dots(2)");
    }
}
/// <summary>
/// Asks a ParagraphParser for m_para to break the current annotation
/// (m_hvoCurrentAnnotation) within ParasInView, and stores the first annotation of the
/// returned list in HvoNewAnnotation.
/// </summary>
public void BreakPhraseAnnotation()
{
    using (ParagraphParser phraseParser = new ParagraphParser(m_para))
    {
        List<int> replacements = phraseParser.BreakPhraseAnnotation(m_hvoCurrentAnnotation, ParasInView);
        // Only the first resulting annotation is remembered.
        HvoNewAnnotation = replacements[0];
    }
}
/// <summary>
/// Breaks the phrase annotation at the given segment/form position and flags the paragraph
/// as needing a reparse.
/// </summary>
/// <param name="iSegment">Index of the segment, passed to GetSegmentForm.</param>
/// <param name="iSegForm">Index of the form within the segment, passed to GetSegmentForm.</param>
/// <returns>The list returned by ParagraphParser.BreakPhraseAnnotation.</returns>
virtual internal protected List<int> BreakPhrase(int iSegment, int iSegForm)
{
    // 'using' guarantees the parser is disposed even if BreakPhraseAnnotation throws.
    using (ParagraphParser pp = new ParagraphParser(m_para))
    {
        List<int> newCbas = pp.BreakPhraseAnnotation(GetSegmentForm(iSegment, iSegForm), new int[] { m_para.Hvo });
        // Recompute other segment forms that match this combination of phrases.
        NeedReparseParagraph = true;
        return newCbas;
    }
}
/// <summary>
/// Parses one text: applies the options, optionally resets all wordform occurrences,
/// reloads the paragraph data if the text has been parsed before, then salvages dummy
/// annotations, parses, records the parse timestamp, adds entry guesses and cleans up
/// leftover annotations.
/// </summary>
/// <param name="text">The text to parse.</param>
/// <param name="options">Parser options; ResetConcordance controls the occurrence reset.</param>
/// <param name="progress">Progress reporter handed on to the parser.</param>
private static void ParseTextCore(IStText text, ParagraphParserOptions options, ProgressState progress)
{
    using (ParagraphParser parser = new ParagraphParser(text.Cache))
    {
        parser.SetOptions(options);

        if (options.ResetConcordance)
        {
            WordformInventory inventory = text.Cache.LangProject.WordformInventoryOA as WordformInventory;
            inventory.ResetAllWordformOccurrences();
        }

        bool parsedBefore = text.LastParsedTimestamp != 0;
        if (parsedBefore)
        {
            // The text was parsed earlier, yet it has to be parsed again; possibly another
            // program changed the data. Reload the paragraphs as efficiently as possible.
            string sql = "select Id, UpdStmp, Contents, Contents_Fmt from StTxtPara_ where Owner$ = "
                + text.Hvo + " order by OwnOrd$";
            IDbColSpec columns = DbColSpecClass.Create();
            columns.Push((int)DbColType.koctObjVecOwn, 0, (int)StText.StTextTags.kflidParagraphs, 0);
            columns.Push((int)DbColType.koctTimeStamp, 1, 0, 0);
            columns.Push((int)DbColType.koctString, 1, (int)StTxtPara.StTxtParaTags.kflidContents, 0);
            columns.Push((int)DbColType.koctFmt, 1, (int)StTxtPara.StTxtParaTags.kflidContents, 0);
            text.Cache.VwOleDbDaAccessor.Load(sql, columns, text.Hvo, 0, null, false);
        }

        parser.SalvageDummyAnnotations(text);
        parser.Parse(text, progress);
        text.RecordParseTimestamp();
        parser.AddEntryGuesses(progress);
        parser.CleanupLeftoverAnnotations(progress);
    }
}
/// <summary>
/// Parses a single paragraph with the given options, using a parser created from the
/// paragraph's cache and the supplied segment / segment-form tags.
/// </summary>
/// <param name="para">The paragraph to parse.</param>
/// <param name="tagSegments">Tag passed to the parser constructor for segments.</param>
/// <param name="tagSegForms">Tag passed to the parser constructor for segment forms.</param>
/// <param name="options">Options handed to ParseWithOptions.</param>
private static void ParseParagraph(IStTxtPara para, int tagSegments, int tagSegForms, ParagraphParserOptions options)
{
    using (ParagraphParser parser = new ParagraphParser(para.Cache, tagSegments, tagSegForms))
    {
        parser.ParseWithOptions(para, options);
    }
}
/// <summary>
/// Parses the given paragraphs, creating dummy wordforms, and (when a matcher is supplied)
/// collects the matching annotations. Updating of concordance wordforms is suspended for
/// the duration of the parse and is re-enabled afterwards, even if parsing throws.
/// </summary>
/// <param name="cache">The cache used to create the parser and preload paragraph contents.</param>
/// <param name="hvosStTxtPara">Hvos of the paragraphs to parse.</param>
/// <param name="progress">Progress reporter; one milestone is added per paragraph when it is a MilestoneProgressState.</param>
/// <param name="matcher">Matcher handed to the parser; when null no match list is created.</param>
/// <param name="line">Concordance line handed to the parser.</param>
/// <returns>The parser's list of matching annotations, or null when no matcher was given.</returns>
internal static List<int> ConcordParagraphs(FdoCache cache, int[] hvosStTxtPara, ProgressState progress, IMatcher matcher, ConcordanceControl.ConcordanceLines line)
{
    using (ParagraphParser pp = new ParagraphParser(cache))
    {
        // this will effectively clear ConcordanceWordforms, which seems overkill, but
        // since we are changing the occurrences on those wordforms,
        // and also possibly adding many new wordforms, we should just allow RecordLists that use
        // ConcordanceWordforms to reload the list.
        // (Enhance: is there any way we can make those lists be smart about when they need to reload,
        // rather than forcing them to?)
        WordformInventory wfi = pp.m_wfi as WordformInventory;
        wfi.SuspendUpdatingConcordanceWordforms = true;
        try
        {
            pp.CreateDummyWordforms = true;
            pp.m_hvosStTxtPara = hvosStTxtPara;
            if (matcher != null)
                pp.m_matchingAnnotations = new List<int>();
            ParagraphParser.ResetParseSessionDependentStaticData();

            // Estimate the number of total number of milestones we'll set.
            // Enhance: we could construct a way to set percentage done based upon
            // number of texts and paragraphs in each text.
            if (progress is MilestoneProgressState)
            {
                MilestoneProgressState mp = progress as SIL.FieldWorks.Common.Controls.MilestoneProgressState;
                for (int i = 0; i < pp.m_hvosStTxtPara.Length; ++i)
                {
                    mp.AddMilestone(1);
                }
            }

            // Preload all the paragraphs.
            cache.PreloadIfMissing(hvosStTxtPara, (int)StTxtPara.StTxtParaTags.kflidContents, 0, false);

            // Parse each text to load our paragraph and wordform segment annotations.
            int cPara = 0;
            using (SuppressSubTasks suppressor = new SuppressSubTasks(cache, true))
            {
                foreach (IStTxtPara para in new FdoObjectSet<IStTxtPara>(cache, pp.m_hvosStTxtPara, false))
                {
                    ++cPara;
                    pp.Parse(para, matcher, line);
                    progress.SetMilestone();
                    progress.Breath();
                    // Stop once the maximum number of matches is reached and tell the user.
                    if (pp.m_matchingAnnotations != null &&
                        pp.m_matchingAnnotations.Count >= ConcordanceControl.MaxConcordanceMatches())
                    {
                        MessageBox.Show(String.Format(ITextStrings.ksShowingOnlyTheFirstXXXMatches,
                            pp.m_matchingAnnotations.Count, cPara, pp.m_hvosStTxtPara.Length),
                            ITextStrings.ksNotice, MessageBoxButtons.OK, MessageBoxIcon.Information);
                        break;
                    }
                }
                pp.CleanupLeftoverAnnotations(progress);
            }
            progress.SetMilestone();
            progress.Breath();
            return pp.m_matchingAnnotations;
        }
        finally
        {
            // Always re-enable updating, otherwise an exception during parsing would leave
            // the wordform inventory permanently suspended.
            wfi.SuspendUpdatingConcordanceWordforms = false;
        }
    }
}
/// <summary>
/// Creates a segment maker for the given string, remembering the paragraph parser it
/// belongs to.
/// </summary>
/// <param name="text">The string, forwarded to the base constructor.</param>
/// <param name="wsf">The writing system factory, forwarded to the base constructor.</param>
/// <param name="pp">The paragraph parser stored for later use.</param>
internal SegmentMaker(ITsString text, ILgWritingSystemFactory wsf, ParagraphParser pp)
    : base(text, wsf)
{
    this.m_paraParaser = pp;
}
/// <summary>
/// Parses every given text, whether or not it has been fully analyzed before, collecting
/// word occurrences and caching the paragraph and wordform related virtual properties.
/// </summary>
/// <param name="cache">The cache the texts live in.</param>
/// <param name="hvosStText">list of hvos for StText objects</param>
/// <param name="progress">Progress reporter; a NullProgressState is substituted when null.</param>
public static void ConcordTexts(FdoCache cache, int[] hvosStText, ProgressState progress)
{
    if (progress == null)
    {
        progress = new NullProgressState();
    }

    using (ParagraphParser pp = new ParagraphParser(cache))
    {
#if PROFILING
        long ticks = DateTime.Now.Ticks;
#endif
        // Bring all information about the paragraphs of texts, and the paragraph contents,
        // into the cache and make sure it is current.
        // Enhance JohnT: possibly performance would be helped, especially in cases where we have a lot
        // of archived Scripture versions, by restricting this to just the texts in hvosStText.
        cache.LoadAllOfAnOwningVectorProp((int)StText.StTextTags.kflidParagraphs, "StText");
        cache.LoadAllOfAStringProp((int)StTxtPara.StTxtParaTags.kflidContents);
#if PROFILING
        Debug.WriteLine("Time to end of loading text data = " + (DateTime.Now.Ticks - ticks));
#endif
        pp.m_hvosStText = hvosStText;
        pp.RebuildingConcordanceWordforms = true;
        (cache.LangProject.WordformInventoryOA as WordformInventory).ResetConcordanceWordformsAndOccurrences();
#if PROFILING
        Debug.WriteLine("Time to end of reset occurrenes = " + (DateTime.Now.Ticks - ticks));
#endif
        ParagraphParser.ResetParseSessionDependentStaticData();

        // Estimate the total number of milestones we'll set.
        // Enhance: we could construct a way to set percentage done based upon
        // number of texts and paragraphs in each text.
        MilestoneProgressState milestones = progress as MilestoneProgressState;
        if (milestones != null)
        {
            for (int iText = 0; iText < pp.m_hvosStText.Length; ++iText)
            {
                AddParseTextMilestones(milestones);
            }
        }

        // Parse each text to load our paragraph and wordform segment annotations.
        using (SuppressSubTasks suppressor = new SuppressSubTasks(cache, true))
        {
            List<IStText> texts = new List<IStText>(new FdoObjectSet<IStText>(cache, pp.m_hvosStText, false));
            // No reload of previously parsed texts is needed here: the contents of ALL
            // paragraphs were loaded above.
#if PROFILING
            Debug.WriteLine("Time to end of preliminaries = " + (DateTime.Now.Ticks - ticks));
#endif
            // Salvaging needs a loop of its own; otherwise things get confused as we start to
            // reuse annotations in pp.Parse() and then re-encounter them in later attempts to
            // salvage Pfics and segments.
            foreach (IStText salvageText in texts)
            {
                pp.SalvageDummyAnnotations(salvageText);
            }
#if PROFILING
            Debug.WriteLine("Time to start of main parse loop = " + (DateTime.Now.Ticks - ticks));
#endif
            foreach (IStText parseText in texts)
            {
                pp.Parse(parseText, progress);
            }
#if PROFILING
            Debug.WriteLine("Time to end of main parse loop = " + (DateTime.Now.Ticks - ticks));
#endif
            StText.RecordParseTimestamps(texts);
            pp.CleanupLeftoverAnnotations(progress);
        }

        progress.SetMilestone();
        progress.Breath();
#if PROFILING
        Debug.WriteLine("Parse required " + pp.m_cDummyAnnotations + " dummy annotations" + " but could only reuse " + pp.m_dummyAnnotationsToReuse.Count);
        Debug.WriteLine(" Parse created " + pp.m_cWficsMade + " Wfics, " + pp.m_cPficsMade + " Pfics, and " + pp.m_cSegmentsMade + " Segments");
        Debug.WriteLine(" So far we made a total of " + s_cTotalDummiesMade + "; this parse making dummies took " + pp.m_cTicksMakingDummies);
        Debug.WriteLine(" This parse we reset " + pp.m_cTotalDummiesReset + " in a time of " + pp.m_cTicksResettingDummies);
#endif
    }
}
/// <summary>
/// Returns the wordforms gathered during the last parsing session, as a new set built from
/// the WordformIdOccurrencesTable of a temporary parser.
/// </summary>
/// <param name="cache">The cache used to create the temporary parser.</param>
/// <returns>A new set created from the parser's WordformIdOccurrencesTable.</returns>
public static Set<int> WordformsFromLastParseSession(FdoCache cache)
{
    using (ParagraphParser parser = new ParagraphParser(cache))
    {
        // The set is built before the parser is disposed on leaving this block.
        return new Set<int>(parser.WordformIdOccurrencesTable);
    }
}
/// <summary>
/// Sets the paragraph contents from the builder, collects the segment annotations for the
/// new contents, and caches them as the paragraph's ktagParaSegments property.
/// </summary>
/// <param name="bldr">Builder holding the string to install as the paragraph contents.</param>
/// <param name="para">The paragraph to update and segment.</param>
/// <returns>The segment annotations collected for the paragraph.</returns>
private List<int> GetSegments(ITsStrBldr bldr, ScrTxtPara para)
{
    para.Contents.UnderlyingTsString = bldr.GetString();
    // ParagraphParser is disposable; release it as soon as the segments are cached.
    using (ParagraphParser pp = new ParagraphParser(para))
    {
        List<int> eosIndexes;
        List<int> segments = pp.CollectSegmentAnnotations(para.Contents.UnderlyingTsString, out eosIndexes);
        Cache.VwCacheDaAccessor.CacheVecProp(para.Hvo, ktagParaSegments, segments.ToArray(), segments.Count);
        return segments;
    }
}
/// <summary>
/// Ensure that the segments property of the paragraph is consistent with its contents and consists of real
/// database objects.
/// </summary>
/// <param name="para">The paragraph whose segments are recomputed and cached.</param>
/// <param name="wsBt">Writing system passed to LoadSegmentFreeTranslations.</param>
/// <returns>The paragraph's cache.</returns>
internal static FdoCache EnsureMainParaSegments(IStTxtPara para, int wsBt)
{
    // ParagraphParser is disposable; make sure it is released even if something below throws.
    using (ParagraphParser pp = new ParagraphParser(para))
    {
        List<int> EosOffsets;
        List<int> segs = pp.CollectSegmentAnnotationsOfPara(out EosOffsets);
        // Make sure the segments list is up to date.
        FdoCache cache = para.Cache;
        cache.VwCacheDaAccessor.CacheVecProp(para.Hvo, StTxtPara.SegmentsFlid(cache), segs.ToArray(), segs.Count);
        // This further makes sure all are real.
        StTxtPara.LoadSegmentFreeTranslations(new int[] { para.Hvo }, cache, wsBt);
        return cache;
    }
}