/// <summary>
/// Splits a paragraph's text into segments.
///
/// Segment boundaries are driven mainly by 'EOS' (end-of-segment) characters: the various characters
/// that normally terminate a sentence, plus a special character the user can insert to force shorter
/// segments. Chapter and verse numbers (or whatever 'IsLabelText' identifies as labels) are placed in
/// segments of their own.
///
/// An EOS character does not by itself produce a break. It may be followed by quotes, parentheses and
/// other punctuation, and a new segment is only started once word-forming characters (or label text)
/// are found after the EOS. Consequently the final segment may carry any amount of trailing non-letter
/// data. When a following segment is created, the break is placed at the end of the first run of white
/// space after the EOS (numbers are likely to label what follows, so they stay with the following
/// text). If there is no white space, the break falls exactly at the first letter after the EOS.
///
/// For label text, the break is always exactly at the start of a run that has the label style. White
/// space following a label run belongs to the label's segment, and several label-style runs (possibly
/// separated by, and followed by, white space) are merged into one segment.
///
/// For every segment except possibly the last, the routine also reports the character index of the
/// first EOS character in that segment; for label segments, or segments ended by a label rather than
/// an EOS character, it reports the index of the character following the segment. This helps when
/// adjusting segment boundaries, because material inserted before the EOS is less likely to change how
/// the text breaks up (unless, of course, the inserted material itself contains an EOS).
/// </summary>
/// <param name="tssText">The text to be segmented; it is handed to the SegmentMaker.</param>
/// <param name="ichMinSegBreaks">Receives the SegmentMaker's EOS positions, as described above.</param>
/// <returns>The segments produced by the SegmentMaker.</returns>
internal IList<ISegment> CollectSegments(ITsString tssText, out List<int> ichMinSegBreaks)
{
	Debug.Assert(m_para != null);

	// Start from a fresh snapshot of the paragraph's current segments so that existing
	// annotations can be reused where possible.
	m_preExistingSegs.Clear();
	m_preExistingSegs.AddRange(m_para.SegmentsOS);

	var segmentMaker = new SegmentMaker(tssText, m_cache.WritingSystemFactory, this);
	segmentMaker.Run();
	ichMinSegBreaks = segmentMaker.EosPositions;

	// Anything still listed here is a left-over segment (presumably the run removes the
	// entries it reuses -- that happens outside this method).
	int leftoverCount = m_preExistingSegs.Count;
	if (leftoverCount > 0)
	{
		// Delete the left-over segments, which occupy the tail of the owning sequence.
		// Enhance JohnT: should we copy their annotations into the last surviving segment if any?
		int firstLeftoverIndex = m_para.SegmentsOS.Count - leftoverCount;
		m_para.SegmentsOS.Replace(firstLeftoverIndex, leftoverCount, new ICmObject[0]);

		// I (JohnT) don't think it will be used again, but play safe.
		m_preExistingSegs.Clear();
	}

	return segmentMaker.Segments;
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Repairs the analysis of a paragraph. If any of its segments has word-level analyses,
/// the whole paragraph is reparsed; otherwise it is only resegmented, and any segments
/// beyond the number the resegmentation produced are removed.
/// </summary>
/// <param name="para">The paragraph whose segments are to be fixed.</param>
/// ------------------------------------------------------------------------------------
private static void FixParaAnalysis(IScrTxtPara para)
{
	// Any word-level analysis at all means the whole text has to be reparsed.
	if (para.SegmentsOS.Any(segment => segment.AnalysesRS.Count > 0))
	{
		ParagraphParser.ParseParagraph(para);
		return;
	}

	// No analyses; just resegment it.
	using (var parser = new ParagraphParser(para))
	{
		parser.CollectPreExistingParaAnnotations();
		var segmentMaker = new SegmentMaker(para.Contents, para.Cache.WritingSystemFactory, parser);
		segmentMaker.Run();

		// The paragraph may hold more segments than it should have; remove any extras
		// that are floating around, working backwards from the end.
		int extraIndex = para.SegmentsOS.Count - 1;
		while (extraIndex >= segmentMaker.Segments.Count)
		{
			para.SegmentsOS.RemoveAt(extraIndex);
			extraIndex--;
		}
	}
}