예제 #1
0
		/// <summary>
		/// Breaks the text of a paragraph into segments.
		///
		/// Breaks are driven mainly by 'EOS' (end-of-segment) characters: the various characters that usually end
		/// sentences, plus a special one the user can insert into the text to force smaller segments. Chapter and
		/// verse numbers (or whatever 'IsLabelText' identifies as labels) are given segments of their own.
		///
		/// Finding an EOS character is not by itself enough to end a segment. Punctuation such as quotes and
		/// parentheses may follow it, and no break is made unless some word-forming characters (or label text) are
		/// found after the EOS; the last segment can therefore carry any amount of trailing non-letter data. There
		/// may also be a good deal of non-letter data between the EOS and the next letter. Partly because numbers
		/// are likely to be labels of what follows and belong with it, the current rule is: once a letter has been
		/// found and a following segment is going to be made, the break goes at the end of the first run of white
		/// space after the EOS, or, when there is no white space, right at the first letter after the EOS.
		///
		/// For label text the break is always exactly at the start of a run that has the label style. White space
		/// after a label run stays in the label's segment, and several label-style runs (possibly separated by, and
		/// including following, white space) are merged into one segment.
		///
		/// For each segment except possibly the last, the routine also reports the character index of the first EOS
		/// character in the segment (or, for label segments and segments that end because of a label rather than
		/// an EOS character, the index of the character following the segment). This helps when adjusting segment
		/// boundaries, because material inserted into a segment before the EOS is less likely to change how the
		/// segments break up (unless of course it includes an EOS).
		/// </summary>
		/// <param name="tssText">The text handed to the segment maker to be broken into segments.</param>
		/// <param name="ichMinSegBreaks">Receives the segment maker's EOS positions, as described above.</param>
		/// <returns>The segments produced by the segment maker.</returns>
		internal IList<ISegment> CollectSegments(ITsString tssText, out List<int> ichMinSegBreaks)
		{
			Debug.Assert(m_para != null);

			// Snapshot the paragraph's current segments; this is the information needed to reuse
			// existing annotations where possible.
			m_preExistingSegs.Clear();
			m_preExistingSegs.AddRange(m_para.SegmentsOS);

			var segmentMaker = new SegmentMaker(tssText, m_cache.WritingSystemFactory, this);
			segmentMaker.Run();
			ichMinSegBreaks = segmentMaker.EosPositions;

			int leftoverCount = m_preExistingSegs.Count;
			if (leftoverCount > 0)
			{
				// Some pre-existing segments are left over: delete that many from the end of the
				// paragraph's segment list.
				// Enhance JohnT: should we copy their annotations into the last surviving segment if any?
				int firstLeftover = m_para.SegmentsOS.Count - leftoverCount;
				m_para.SegmentsOS.Replace(firstLeftover, leftoverCount, new ICmObject[0]);

				// I (JohnT) don't think the list will be used again, but play safe.
				m_preExistingSegs.Clear();
			}

			return segmentMaker.Segments;
		}
		/// ------------------------------------------------------------------------------------
		/// <summary>
		/// Fixes the paragraph's analysis. A paragraph with any word-level analysis is reparsed
		/// completely; otherwise it is only resegmented and any surplus segments are removed.
		/// </summary>
		/// <param name="para">The paragraph.</param>
		/// ------------------------------------------------------------------------------------
		private static void FixParaAnalysis(IScrTxtPara para)
		{
			// Any word-level analysis at all means the whole text has to be reparsed.
			if (para.SegmentsOS.Any(seg => seg.AnalysesRS.Count > 0))
			{
				ParagraphParser.ParseParagraph(para);
				return;
			}

			// No analyses; resegmenting is enough.
			using (var paraParser = new ParagraphParser(para))
			{
				paraParser.CollectPreExistingParaAnnotations();
				var maker = new SegmentMaker(para.Contents, para.Cache.WritingSystemFactory, paraParser);
				maker.Run();

				// If the paragraph holds more segments than the segment maker produced, strip the
				// extras off the end, working backwards. The loop does nothing when there is no surplus.
				int wantedCount = maker.Segments.Count;
				for (int iSeg = para.SegmentsOS.Count - 1; iSeg >= wantedCount; iSeg--)
				{
					para.SegmentsOS.RemoveAt(iSeg);
				}
			}
		}