コード例 #1
0
        /// <summary>
        /// Splits a string into sentences at the boundaries reported by
        /// <c>SentencePositionDetect</c>.
        /// </summary>
        /// <param name="input">
        /// The string to be processed.
        /// </param>
        /// <param name="positions">
        /// On return, one pair per sentence: the first value is the start index of
        /// the sentence within <paramref name="input"/>, the second value is its length.
        /// </param>
        /// <returns>
        /// The sentences in order of appearance. When no boundary is reported the
        /// array contains <paramref name="input"/> as its only element.
        /// </returns>
        public string[] SentenceDetect(string input, out Util.Pair <int, int>[] positions)
        {
            int[] boundaries = SentencePositionDetect(input);

            // No boundary at all: the whole input is a single sentence.
            if (boundaries.Length == 0)
            {
                positions = new Util.Pair <int, int>[] { new Util.Pair <int, int>(0, input.Length) };
                return new string[] { input };
            }

            // Text after the last boundary becomes one extra, trailing sentence.
            int  lastBoundary    = boundaries[boundaries.Length - 1];
            bool hasTrailingText = lastBoundary != input.Length;
            int  sentenceCount   = hasTrailingText ? boundaries.Length + 1 : boundaries.Length;

            string[] result = new string[sentenceCount];
            positions = new Util.Pair <int, int> [sentenceCount];

            // Each boundary closes the sentence that began at the previous boundary
            // (or at index 0 for the first one).
            int sentenceStart = 0;
            for (int i = 0; i < boundaries.Length; i++)
            {
                int sentenceLength = boundaries[i] - sentenceStart;
                positions[i] = new Util.Pair <int, int>(sentenceStart, sentenceLength);
                result[i]    = input.Substring(sentenceStart, sentenceLength);
                sentenceStart = boundaries[i];
            }

            if (hasTrailingText)
            {
                positions[sentenceCount - 1] = new Util.Pair <int, int>(lastBoundary, input.Length - lastBoundary);
                result[sentenceCount - 1]    = input.Substring(lastBoundary);
            }

            return result;
        }
コード例 #2
0
        /// <summary>
        /// Builds a Parse structure from a tree-bank style (bracketed) parse string.
        /// </summary>
        /// <param name="parse">
        /// A tree-bank style parse string.
        /// </param>
        /// <returns>
        /// A root Parse of type <c>MaximumEntropyParser.TopNode</c> covering the
        /// reconstructed text, with every collected constituent inserted into it.
        /// </returns>
        /// <exception cref="ParseException">
        /// Thrown when <c>GetType</c> returns null for the text following an opening parenthesis.
        /// </exception>
        public static Parse FromParseString(string parse)
        {
            const string emptyNodeType = "-NONE-";

            StringBuilder surfaceText = new StringBuilder();
            Stack <Util.Pair <string, int> > openNodes = new Stack <Util.Pair <string, int> >();
            List <Util.Pair <string, Util.Span> > collected = new List <Util.Pair <string, Util.Span> >();
            int textOffset = 0;

            for (int i = 0; i < parse.Length; i++)
            {
                switch (parse[i])
                {
                    case '(':
                    {
                        // Everything after the parenthesis is handed to the type/token readers.
                        string remainder = parse.Substring(i + 1);
                        string nodeType  = GetType(remainder);
                        if (nodeType == null)
                        {
                            throw new ParseException("null type for: " + remainder);
                        }
                        string word = GetToken(remainder);

                        // Remember where this node starts so its span can be closed at the matching ')'.
                        openNodes.Push(new Util.Pair <string, int>(nodeType, textOffset));

                        // "-NONE-" nodes contribute neither text nor a token constituent.
                        if (word != null && nodeType != emptyNodeType)
                        {
                            Util.Span wordSpan = new Util.Span(textOffset, textOffset + word.Length);
                            collected.Add(new Util.Pair <string, Util.Span>(MaximumEntropyParser.TokenNode, wordSpan));
                            surfaceText.Append(word).Append(" ");
                            // +1 accounts for the separating space just appended.
                            textOffset += word.Length + 1;
                        }
                        break;
                    }

                    case ')':
                    {
                        Util.Pair <string, int> opened = openNodes.Pop();
                        if (opened.FirstValue != emptyNodeType)
                        {
                            // The span ends before the space that follows the last token.
                            Util.Span nodeSpan = new Util.Span(opened.SecondValue, textOffset - 1);
                            collected.Add(new Util.Pair <string, Util.Span>(opened.FirstValue, nodeSpan));
                        }
                        break;
                    }
                }
            }

            string text = surfaceText.ToString();
            Parse  root = new Parse(text, new Util.Span(0, text.Length), MaximumEntropyParser.TopNode, 1);

            // The root already stands for the top node, so constituents of that type are not inserted again.
            foreach (Util.Pair <string, Util.Span> constituent in collected)
            {
                if (constituent.FirstValue != MaximumEntropyParser.TopNode)
                {
                    root.Insert(new Parse(text, constituent.SecondValue, constituent.FirstValue, 1));
                }
            }

            return root;
        }
コード例 #3
0
ファイル: DecimationFilter.cs プロジェクト: Faham/emophiz
        /// <summary>
        /// Builds a decimation filter: a Butterworth low-pass (anti-alias) stage
        /// chained with a down-sampling stage.
        /// </summary>
        /// <param name="factor">
        /// Decimation factor. The low-pass corner frequency is <c>Math.PI / factor</c>
        /// and the same factor is passed to the down-sampling stage.
        /// </param>
        /// <param name="filterOrder">Order of the Butterworth filter.</param>
        /// <param name="bidirectional">
        /// When true, the anti-alias filter is wrapped in a <c>BidirectionalFilter</c>.
        /// </param>
        public DecimationFilter(int factor, int filterOrder, bool bidirectional)
        {
            this.factor = factor;

            // Only the first corner frequency is set; the second value of the pair is passed as 0.0.
            FilterOrderSpec lowPassSpec = FilterOrderSpec.CreateButterworthSpec(
                new Util.Pair<double,double>(Math.PI / factor, 0.0),
                filterOrder,
                BandType.LowPass);

            Filter lowPass   = FilterFactory.CreateIirFilter(lowPassSpec);
            Filter antiAlias = bidirectional ? new BidirectionalFilter(lowPass) : lowPass;

            // Anti-aliasing runs first, then the samples are dropped.
            filter = new FilterChain(new Filter[] { antiAlias, new DownSampleFilter(factor) });
        }
        /// <summary>
        /// Converts one annotated line into training events, one per token, and
        /// appends them to <c>mEventList</c>.
        /// </summary>
        /// <param name="line">
        /// An annotated line in the format accepted by <c>ConvertAnnotatedString</c>.
        /// </param>
        private void AddEvents(string line)
        {
            Util.Pair <ArrayList, ArrayList> linePair = ConvertAnnotatedString(line);
            ArrayList outcomes = linePair.SecondValue;

            // The token list never changes inside the loop, so it is copied to an
            // array once instead of once per token.
            object[]      tokens = linePair.FirstValue.ToArray();
            List <string> tags   = new List <string>(tokens.Length);

            for (int currentToken = 0; currentToken < tokens.Length; currentToken++)
            {
                // Only the tags of earlier tokens are visible to the context generator,
                // so this snapshot must be taken before the current tag is added.
                string[] context = mContextGenerator.GetContext(currentToken, tokens, tags.ToArray(), null);
                string   outcome = (string)outcomes[currentToken];

                SharpEntropy.TrainingEvent posTrainingEvent = new SharpEntropy.TrainingEvent(outcome, context);
                tags.Add(outcome);
                mEventList.Add(posTrainingEvent);
            }
        }
        /// <summary>
        /// Splits an annotated line into two parallel lists.
        /// </summary>
        /// <param name="input">
        /// The annotated line. It is tokenized with <c>Util.StringTokenizer</c> and
        /// every token is passed to <c>Split</c>.
        /// </param>
        /// <returns>
        /// A pair of lists in input order: the first list holds the first value that
        /// <c>Split</c> returned for each token, the second list holds the second value.
        /// </returns>
        public static Util.Pair <ArrayList, ArrayList> ConvertAnnotatedString(string input)
        {
            ArrayList words  = new ArrayList();
            ArrayList labels = new ArrayList();

            Util.StringTokenizer tokenizer = new Util.StringTokenizer(input);

            // A null token marks the end of the input.
            for (string item = tokenizer.NextToken(); item != null; item = tokenizer.NextToken())
            {
                Util.Pair <string, string> halves = Split(item);
                words.Add(halves.FirstValue);
                labels.Add(halves.SecondValue);
            }

            return new Util.Pair <ArrayList, ArrayList>(words, labels);
        }
コード例 #6
0
        /// <summary>
        /// Builds the list of features for one position inside a string.
        /// </summary>
        /// <param name="pair">
        /// First value: the string being examined. Second value: the index of the
        /// character under investigation.
        /// </param>
        /// <returns>
        /// The predicates in this order: "p=" (text before the index), "s=" (text from
        /// the index on), the predicates for the one and two preceding characters, the
        /// predicates for the current and the following character, and finally "cc"
        /// when the string starts with '&amp;' and ends with ';'. The literal value
        /// "bok" is emitted wherever a neighbouring character does not exist.
        /// </returns>
        public virtual string[] GetContext(Util.Pair <string, int> pair)
        {
            string text     = pair.FirstValue;
            int    position = pair.SecondValue;

            List <string> features = new List <string>();

            features.Add("p=" + text.Substring(0, position));
            features.Add("s=" + text.Substring(position));

            // Characters before the position.
            if (position <= 0)
            {
                features.Add("p1=bok");
            }
            else
            {
                AddCharPredicates("p1", text[position - 1], features);
                if (position <= 1)
                {
                    features.Add("p2=bok");
                }
                else
                {
                    AddCharPredicates("p2", text[position - 2], features);
                    features.Add("p21=" + text[position - 2] + text[position - 1]);
                }
                features.Add("p1f1=" + text[position - 1] + text[position]);
            }

            // The character at the position and the one after it.
            AddCharPredicates("f1", text[position], features);
            int following = position + 1;
            if (following >= text.Length)
            {
                features.Add("f2=bok");
            }
            else
            {
                AddCharPredicates("f2", text[following], features);
                features.Add("f12=" + text[position] + text[following]);
            }

            // "&...;" shape: flagged as a character code.
            bool isCharacterCode = text[0] == '&' && text[text.Length - 1] == ';';
            if (isCharacterCode)
            {
                features.Add("cc");
            }

            return features.ToArray();
        }
コード例 #7
0
        /// <summary>
        /// Creates one sentence-detection event for every candidate position found in
        /// the given sentence text and links the events onto the chain held in
        /// <c>mHead</c> / <c>mTail</c>.
        /// </summary>
        /// <param name="token">
        /// The sentence text; it is trimmed before being appended to the shared buffer.
        /// </param>
        private void AddNewEvents(string token)
        {
            System.Text.StringBuilder text = mBuffer;
            text.Append(token.Trim());

            // Last character currently in the buffer: the only candidate labelled "T".
            int trueBoundary = text.Length - 1;

            // Append the first word of the following text so the context generator
            // can see what comes after the sentence end.
            bool canPeekAhead = mNext != null && token.Length > 0;
            if (canPeekAhead)
            {
                int firstSpace = mNext.IndexOf(" ");
                if (firstSpace != -1)
                {
                    // TODO (from the original author): usually add a space before the
                    // next sentence, but sometimes leave no space.
                    text.Append(" ");
                    text.Append(mNext.Substring(0, firstSpace));
                }
            }

            foreach (int candidate in mScanner.GetPositions(text))
            {
                string label = (candidate == trueBoundary) ? "T" : "F";
                Util.Pair <System.Text.StringBuilder, int> candidateContext = new Util.Pair <System.Text.StringBuilder, int>(text, candidate);
                SentenceDetectionEvent sentenceEvent = new SentenceDetectionEvent(label, mContextGenerator.GetContext(candidateContext));

                if (mTail != null)
                {
                    // Normal case: extend the chain at its tail.
                    mTail.NextEvent = sentenceEvent;
                    mTail = sentenceEvent;
                }
                else if (mHead == null)
                {
                    // Empty chain: the event becomes the head.
                    mHead = sentenceEvent;
                }
                else if (mHead.NextEvent == null)
                {
                    // Single-element chain: the event becomes both head's successor and the tail.
                    mTail = sentenceEvent;
                    mHead.NextEvent = sentenceEvent;
                }
            }

            // Reset the shared buffer for the next call.
            text.Length = 0;
        }
コード例 #8
0
        /// <summary>
        /// Finds the sentence breaks in a string.
        /// </summary>
        /// <param name="input">
        /// The string to be processed. When <c>mUnicodeMapping</c> is set, the string is
        /// first passed through <c>Utils.MapUnicodeChars</c> and all returned indices
        /// refer to the mapped string.
        /// </param>
        /// <returns>
        /// One index per accepted sentence break, in ascending candidate order: the value
        /// returned by <c>GetFirstNonWhitespace</c> for the whitespace that follows the
        /// break character. <c>SentenceDetect</c> uses each value both as the exclusive
        /// end of one sentence and as the start of the next.
        /// </returns>
        /// <remarks>
        /// Clears <c>mSentenceProbs</c> and refills it with the model probability of
        /// every break that is reported.
        /// </remarks>
        public virtual int[] SentencePositionDetect(string input)
        {
            if (mUnicodeMapping)
            {
                input = Utils.MapUnicodeChars(input);
            }

            mSentenceProbs.Clear();
            System.Text.StringBuilder buffer = new System.Text.StringBuilder(input);
            List <int> endersList            = mScanner.GetPositions(input);
            List <int> positions             = new List <int>(endersList.Count);

            // index tracks the position just after the most recently accepted break.
            for (int currentEnder = 0, enderCount = endersList.Count, index = 0; currentEnder < enderCount; currentEnder++)
            {
                int candidate = endersList[currentEnder];

                // skip over the leading parts of non-token final delimiters
                int firstWhiteSpace = GetFirstWhitespace(input, candidate + 1);
                if (((currentEnder + 1) < enderCount) && (endersList[currentEnder + 1] < firstWhiteSpace))
                {
                    continue;
                }

                Util.Pair <System.Text.StringBuilder, int> pair = new Util.Pair <System.Text.StringBuilder, int>(buffer, candidate);
                double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(pair));
                string   bestOutcome   = mModel.GetBestOutcome(probabilities);

                if (bestOutcome.Equals("T") && IsAcceptableBreak(input, index, candidate))
                {
                    // A break at the very position where the current sentence starts
                    // would yield an empty sentence, so it is not reported.
                    if (index != candidate)
                    {
                        // firstWhiteSpace was computed above for the same arguments; reuse it.
                        positions.Add(GetFirstNonWhitespace(input, firstWhiteSpace));
                        mSentenceProbs.Add(probabilities[mModel.GetOutcomeIndex(bestOutcome)]);
                    }
                    index = candidate + 1;
                }
            }

            return positions.ToArray();
        }
コード例 #9
0
        /// <summary>
        /// Tags every line read from a stream and reports how often the predicted tags
        /// match the annotated outcomes.
        /// </summary>
        /// <param name="posModel">
        /// The model to evaluate; it is stored in <c>mPosModel</c> before evaluation starts.
        /// </param>
        /// <param name="reader">
        /// Reader whose <c>BaseStream</c> supplies the annotated text, one sentence per
        /// line. The stream is read through a new UTF-7 reader, not through
        /// <paramref name="reader"/> itself.
        /// </param>
        /// <param name="accuracy">
        /// Fraction of tags equal to the annotated outcome; NaN when no tag was evaluated.
        /// </param>
        /// <param name="sentenceAccuracy">
        /// Fraction of lines in which every tag matched; NaN when no line was read.
        /// </param>
        public virtual void LocalEvaluate(SharpEntropy.IMaximumEntropyModel posModel, System.IO.StreamReader reader, out double accuracy, out double sentenceAccuracy)
        {
            mPosModel = posModel;

            // Whole-number counters: a float counter stops increasing once it reaches
            // 2^24 (16,777,216), which would silently distort both ratios on large corpora.
            long total = 0, correct = 0, sentences = 0, sentencesCorrect = 0;

            // NOTE(review): UTF-7 is kept to preserve existing behaviour, but
            // Encoding.UTF7 is obsolete in current .NET - confirm the intended encoding.
            // The reader is deliberately not disposed: disposing it would close the caller's stream.
            System.IO.StreamReader sentenceReader = new System.IO.StreamReader(reader.BaseStream, System.Text.Encoding.UTF7);
            string line;

            while ((line = sentenceReader.ReadLine()) != null)
            {
                sentences++;
                Util.Pair <ArrayList, ArrayList> annotatedPair = PosEventReader.ConvertAnnotatedString(line);
                ArrayList words    = annotatedPair.FirstValue;
                ArrayList outcomes = annotatedPair.SecondValue;
                ArrayList tags     = new ArrayList(Beam.BestSequence(words, null).Outcomes);

                bool isSentenceOK = true;
                for (int tagIndex = 0; tagIndex < tags.Count; tagIndex++)
                {
                    total++;
                    if ((string)tags[tagIndex] == (string)outcomes[tagIndex])
                    {
                        correct++;
                    }
                    else
                    {
                        isSentenceOK = false;
                    }
                }
                if (isSentenceOK)
                {
                    sentencesCorrect++;
                }
            }

            // Floating-point division keeps the NaN result for empty input instead of
            // throwing DivideByZeroException.
            accuracy         = (double)correct / total;
            sentenceAccuracy = (double)sentencesCorrect / sentences;
        }
コード例 #10
0
        /// <summary>
        /// Renders the text file at <c>SRC</c> as a PDF and appends a table of contents.
        /// </summary>
        /// <remarks>
        /// The first line, and every line that follows an empty line, is laid out as a
        /// title and given a named destination ("title00", "title01", ...). All titles
        /// except the first are then listed on a final "Table of Contents" page, each
        /// linking back to its destination.
        /// </remarks>
        /// <param name="dest">Path of the PDF file to write.</param>
        public virtual void CreatePdf(String dest)
        {
            //Initialize PDF document
            PdfDocument pdf = new PdfDocument(new PdfWriter(dest));
            // Initialize document
            Document document = new Document(pdf);
            PdfFont  font     = PdfFontFactory.CreateFont(StandardFonts.TIMES_ROMAN);
            PdfFont  bold     = PdfFontFactory.CreateFont(StandardFonts.HELVETICA_BOLD);

            document.SetTextAlignment(TextAlignment.JUSTIFIED).SetHyphenation(new HyphenationConfig("en", "uk", 3, 3))
            .SetFont(font).SetFontSize(11);

            Paragraph p;
            IList <Util.Pair <String, Util.Pair <String, int> > > toc = new List <Util.Pair
                                                                                  <String, Util.Pair <String, int> > >();

            // The using block closes the source reader even when layout throws;
            // previously the reader was never closed.
            using (StreamReader sr = File.OpenText(SRC))
            {
                String line;
                bool   title   = true;
                int    counter = 0;

                while ((line = sr.ReadLine()) != null)
                {
                    p = new Paragraph(line);
                    p.SetKeepTogether(true);
                    if (title)
                    {
                        String name = String.Format("title{0:00}", counter++);
                        // NOTE(review): the same pair object is handed to UpdatePageRenderer and
                        // stored in the TOC, presumably so the renderer can correct the page
                        // number at layout time - confirm against UpdatePageRenderer.
                        Util.Pair <String, int> titlePage = new Util.Pair <string, int>(line, pdf.GetNumberOfPages());
                        p.SetFont(bold).SetFontSize(12).SetKeepWithNext(true).SetDestination(name).SetNextRenderer(new UpdatePageRenderer(p, titlePage));
                        title = false;
                        document.Add(p);
                        toc.Add(new Util.Pair <string, Util.Pair <string, int> >(name, titlePage));
                    }
                    else
                    {
                        p.SetFirstLineIndent(36);
                        if (String.IsNullOrEmpty(line))
                        {
                            // An empty line ends the section: the next line is a title.
                            p.SetMarginBottom(12);
                            title = true;
                        }
                        else
                        {
                            p.SetMarginBottom(0);
                        }
                        document.Add(p);
                    }
                }
            }

            document.Add(new AreaBreak(AreaBreakType.NEXT_PAGE));
            p = new Paragraph().SetFont(bold).Add("Table of Contents").SetDestination("toc");
            document.Add(p);

            // The first title is not listed in the table of contents. The guard avoids
            // an ArgumentOutOfRangeException when the source file has no lines.
            if (toc.Count > 0)
            {
                toc.RemoveAt(0);
            }

            IList <TabStop> tabstops = new List <TabStop>();
            tabstops.Add(new TabStop(580, TabAlignment.RIGHT, new DottedLine()));

            foreach (Util.Pair <String, Util.Pair <String, int> > entry in toc)
            {
                Util.Pair <String, int> text = entry.Value;
                p = new Paragraph().AddTabStops(tabstops).Add(text.Key).Add(new Tab()).Add(text.Value.ToString()).SetAction
                        (PdfAction.CreateGoTo(entry.Key));
                document.Add(p);
            }
            //Close document
            document.Close();
        }
コード例 #11
0
 /// <summary>
 /// Creates a renderer for the given paragraph and keeps the supplied pair.
 /// </summary>
 /// <param name="modelElement">Paragraph passed on to the base renderer.</param>
 /// <param name="entry">Pair stored in the <c>entry</c> field of this renderer.</param>
 public UpdatePageRenderer(Paragraph modelElement, Util.Pair <String, int> entry) : base(modelElement)
 {
     this.entry = entry;
 }
コード例 #12
0
        /// <summary>
        /// Builds the list of features for a candidate end-of-sentence character at a
        /// given offset within a StringBuilder.
        /// </summary>
        /// <param name="pair">
        /// First value: the text buffer. Second value: the offset of the candidate
        /// end-of-sentence character within that buffer.
        /// </param>
        /// <returns>
        /// The collected features in this order: "sp" / "sn" when a space directly
        /// precedes / follows the candidate, "eos=" plus the candidate character, then
        /// the "x=" (prefix), "v=" (previous token), "s=" (suffix) and "n=" (next token)
        /// features, each followed by its capitalisation and abbreviation flags.
        /// </returns>
        public virtual string[] GetContext(Util.Pair <System.Text.StringBuilder, int> pair)
        {
            System.Text.StringBuilder text = pair.FirstValue;
            int eosIndex  = pair.SecondValue;
            int lastIndex = text.Length - 1;

            // Space immediately before / after the candidate character.
            if (eosIndex > 0 && text[eosIndex - 1] == ' ')
            {
                mCollectFeatures.Add("sp");
            }
            if (eosIndex < lastIndex && text[eosIndex + 1] == ' ')
            {
                mCollectFeatures.Add("sn");
            }
            mCollectFeatures.Add("eos=" + text[eosIndex]);

            // Prefix: the text between the preceding space and the candidate, cut short
            // at the nearest earlier end-of-sentence character if there is one.
            int prefixStart = PreviousSpaceIndex(text, eosIndex);
            for (int cursor = eosIndex - 1; cursor > prefixStart; cursor--)
            {
                if (IsEndOfSentenceChar(text[cursor]))
                {
                    prefixStart = cursor;
                    break;
                }
            }
            string prefix = text.ToString(prefixStart, eosIndex - prefixStart).Trim();

            // Previous token: the space-delimited text in front of the prefix.
            int    previousStart = PreviousSpaceIndex(text, prefixStart);
            string previousToken = text.ToString(previousStart, prefixStart - previousStart).Trim();

            // Suffix: the text between the candidate and the following space, cut short
            // at the nearest later end-of-sentence character if there is one.
            int suffixEnd = NextSpaceIndex(text, eosIndex, lastIndex);
            for (int cursor = eosIndex + 1; cursor < suffixEnd; cursor++)
            {
                if (IsEndOfSentenceChar(text[cursor]))
                {
                    suffixEnd = cursor;
                    break;
                }
            }

            int nextEnd = NextSpaceIndex(text, suffixEnd + 1, lastIndex + 1);

            // Nothing follows a candidate that is the last character of the buffer.
            string suffix    = "";
            string nextToken = "";
            if (eosIndex != lastIndex)
            {
                suffix    = text.ToString(eosIndex + 1, suffixEnd - (eosIndex + 1)).Trim();
                nextToken = text.ToString(suffixEnd + 1, nextEnd - (suffixEnd + 1)).Trim();
            }

            // Only the prefix also contributes its length as a feature.
            CollectTokenFeatures("x=", "xcap", "xabbrev", prefix, true);
            CollectTokenFeatures("v=", "vcap", "vabbrev", previousToken, false);
            CollectTokenFeatures("s=", "scap", "sabbrev", suffix, false);
            CollectTokenFeatures("n=", "ncap", "nabbrev", nextToken, false);

            string[] context = mCollectFeatures.ToArray();
            mCollectFeatures.Clear();
            return context;
        }

        /// <summary>
        /// Tells whether a character is one of <c>mEndOfSentenceCharacters</c>.
        /// </summary>
        private bool IsEndOfSentenceChar(char candidate)
        {
            for (int i = 0, count = mEndOfSentenceCharacters.Length; i < count; i++)
            {
                if (candidate == mEndOfSentenceCharacters[i])
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Adds the value feature for one token to <c>mCollectFeatures</c> and, when the
        /// token is not empty, its optional length, capitalisation and abbreviation features.
        /// </summary>
        private void CollectTokenFeatures(string valueLabel, string capitalFeature, string abbreviationFeature, string token, bool includeLength)
        {
            // mBuffer is a shared scratch buffer; it is emptied again right after use.
            mBuffer.Append(valueLabel);
            mBuffer.Append(token);
            mCollectFeatures.Add(mBuffer.ToString());
            mBuffer.Length = 0;

            if (token.Length == 0)
            {
                return;
            }

            if (includeLength)
            {
                mCollectFeatures.Add(System.Convert.ToString(token.Length, System.Globalization.CultureInfo.InvariantCulture));
            }
            if (IsFirstUpper(token))
            {
                mCollectFeatures.Add(capitalFeature);
            }
            if (mInducedAbbreviations.Contains(token))
            {
                mCollectFeatures.Add(abbreviationFeature);
            }
        }
コード例 #13
0
        /// <summary>
        /// Renders the text file at <c>SRC</c> as a PDF with custom page labels and a
        /// trailing table of contents.
        /// </summary>
        /// <remarks>
        /// Three "Page left blank intentionally" pages come first, labelled with
        /// lower-case roman numerals from the first page on; decimal labels starting at 1
        /// are set on the page where the text begins, and the table-of-contents page is
        /// labelled with the prefix "TOC". The first line, and every line that follows an
        /// empty line, is laid out as a title with a named destination ("title00",
        /// "title01", ...); all titles except the first are listed in the table of contents.
        /// </remarks>
        /// <param name="dest">Path of the PDF file to write.</param>
        public virtual void CreatePdf(String dest)
        {
            PdfDocument pdf = new PdfDocument(new PdfWriter(dest));

            pdf.GetCatalog().SetPageLayout(PdfName.TwoColumnRight);
            pdf.GetCatalog().SetPageMode(PdfName.UseThumbs);
            PdfPage page = pdf.AddNewPage();

            page.SetPageLabel(PageLabelNumberingStyle.LOWERCASE_ROMAN_NUMERALS, null);
            Document document = new Document(pdf);

            document.Add(new Paragraph().Add("Page left blank intentionally"));
            document.Add(new AreaBreak());
            document.Add(new Paragraph().Add("Page left blank intentionally"));
            document.Add(new AreaBreak());
            document.Add(new Paragraph().Add("Page left blank intentionally"));
            document.Add(new AreaBreak());
            page = pdf.GetLastPage();
            page.SetPageLabel(PageLabelNumberingStyle.DECIMAL_ARABIC_NUMERALS, null, 1);
            PdfFont font = PdfFontFactory.CreateFont(StandardFonts.TIMES_ROMAN);
            PdfFont bold = PdfFontFactory.CreateFont(StandardFonts.HELVETICA_BOLD);

            document.SetTextAlignment(TextAlignment.JUSTIFIED).SetHyphenation(new HyphenationConfig("en", "uk", 3, 3))
            .SetFont(font).SetFontSize(11);

            Paragraph p;
            IList <Util.Pair <String, Util.Pair <String, int> > > toc = new List <Util.Pair
                                                                                  <String, Util.Pair <String, int> > >();

            // The using block closes the source reader even when layout throws;
            // previously the reader was never closed.
            using (StreamReader sr = File.OpenText(SRC))
            {
                String line;
                bool   title   = true;
                int    counter = 0;

                while ((line = sr.ReadLine()) != null)
                {
                    p = new Paragraph(line);
                    p.SetKeepTogether(true);
                    if (title)
                    {
                        String name = String.Format("title{0:00}", counter++);
                        p.SetFont(bold).SetFontSize(12).SetKeepWithNext(true).SetDestination(name);
                        title = false;
                        document.Add(p);
                        // The page count is read right after the title has been added.
                        toc.Add(new Util.Pair <string, Util.Pair <string, int> >(name, new Util.Pair <string, int>(line, pdf.GetNumberOfPages())));
                    }
                    else
                    {
                        p.SetFirstLineIndent(36);
                        if (String.IsNullOrEmpty(line))
                        {
                            // An empty line ends the section: the next line is a title.
                            p.SetMarginBottom(12);
                            title = true;
                        }
                        else
                        {
                            p.SetMarginBottom(0);
                        }
                        document.Add(p);
                    }
                }
            }

            document.Add(new AreaBreak(AreaBreakType.NEXT_PAGE));
            p = new Paragraph().SetFont(bold).Add("Table of Contents").SetDestination("toc");
            document.Add(p);
            page = pdf.GetLastPage();
            page.SetPageLabel(null, "TOC", 1);

            // The first title is not listed in the table of contents. The guard avoids
            // an ArgumentOutOfRangeException when the source file has no lines.
            if (toc.Count > 0)
            {
                toc.RemoveAt(0);
            }

            IList <TabStop> tabstops = new List <TabStop>();
            tabstops.Add(new TabStop(580, TabAlignment.RIGHT, new DottedLine()));

            foreach (Util.Pair <String, Util.Pair <String, int> > entry in toc)
            {
                Util.Pair <String, int> text = entry.Value;
                p = new Paragraph().AddTabStops(tabstops).Add(text.Key).Add(new Tab()).Add(text.Value.ToString()).SetAction
                        (PdfAction.CreateGoTo(entry.Key));
                document.Add(p);
            }
            document.Close();
        }
コード例 #14
0
        /// <summary>
        /// Finds the sentence breaks in a string.
        /// </summary>
        /// <param name="input">
        /// The string to be processed. When <c>mUnicodeMapping</c> is set, the string is
        /// first passed through <c>Utils.MapUnicodeChars</c> and all returned indices
        /// refer to the mapped string.
        /// </param>
        /// <returns>
        /// One index per accepted sentence break, in ascending candidate order: the value
        /// returned by <c>GetFirstNonWhitespace</c> for the whitespace that follows the
        /// break character. <c>SentenceDetect</c> uses each value both as the exclusive
        /// end of one sentence and as the start of the next.
        /// </returns>
        /// <remarks>
        /// Clears <c>mSentenceProbs</c> and refills it with the model probability of
        /// every break that is reported.
        /// </remarks>
        public virtual int[] SentencePositionDetect(string input)
        {
            if (mUnicodeMapping) { input = Utils.MapUnicodeChars(input); }

            mSentenceProbs.Clear();
            System.Text.StringBuilder buffer = new System.Text.StringBuilder(input);
            List<int> endersList = mScanner.GetPositions(input);
            List<int> positions = new List<int>(endersList.Count);

            // index tracks the position just after the most recently accepted break.
            for (int currentEnder = 0, enderCount = endersList.Count, index = 0; currentEnder < enderCount; currentEnder++)
            {
                int candidate = endersList[currentEnder];

                // skip over the leading parts of non-token final delimiters
                int firstWhiteSpace = GetFirstWhitespace(input, candidate + 1);
                if (((currentEnder + 1) < enderCount) && (endersList[currentEnder + 1] < firstWhiteSpace))
                {
                    continue;
                }

                Util.Pair<System.Text.StringBuilder, int> pair = new Util.Pair<System.Text.StringBuilder, int>(buffer, candidate);
                double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(pair));
                string bestOutcome = mModel.GetBestOutcome(probabilities);

                if (bestOutcome.Equals("T") && IsAcceptableBreak(input, index, candidate))
                {
                    // A break at the very position where the current sentence starts
                    // would yield an empty sentence, so it is not reported.
                    if (index != candidate)
                    {
                        // firstWhiteSpace was computed above for the same arguments; reuse it.
                        positions.Add(GetFirstNonWhitespace(input, firstWhiteSpace));
                        mSentenceProbs.Add(probabilities[mModel.GetOutcomeIndex(bestOutcome)]);
                    }
                    index = candidate + 1;
                }
            }

            return positions.ToArray();
        }
コード例 #15
0
        /// <summary>
        /// Splits a string into sentences at the boundaries reported by
        /// <c>SentencePositionDetect</c>.
        /// </summary>
        /// <param name="input">
        /// The string to be processed.
        /// </param>
        /// <param name="positions">
        /// On return, one pair per sentence: the first value is the start index of
        /// the sentence within <paramref name="input"/>, the second value is its length.
        /// </param>
        /// <returns>
        /// The sentences in order of appearance. When no boundary is reported the
        /// array contains <paramref name="input"/> as its only element.
        /// </returns>
        public string[] SentenceDetect(string input, out Util.Pair<int, int>[] positions)
        {
            int[] boundaries = SentencePositionDetect(input);

            // No boundary at all: the whole input is a single sentence.
            if (boundaries.Length == 0)
            {
                positions = new Util.Pair<int, int>[] { new Util.Pair<int, int>(0, input.Length) };
                return new string[] { input };
            }

            // Text after the last boundary becomes one extra, trailing sentence.
            int lastBoundary = boundaries[boundaries.Length - 1];
            bool hasTrailingText = lastBoundary != input.Length;
            int sentenceCount = hasTrailingText ? boundaries.Length + 1 : boundaries.Length;

            string[] result = new string[sentenceCount];
            positions = new Util.Pair<int, int>[sentenceCount];

            // Each boundary closes the sentence that began at the previous boundary
            // (or at index 0 for the first one).
            int sentenceStart = 0;
            for (int i = 0; i < boundaries.Length; i++)
            {
                int sentenceLength = boundaries[i] - sentenceStart;
                positions[i] = new Util.Pair<int, int>(sentenceStart, sentenceLength);
                result[i] = input.Substring(sentenceStart, sentenceLength);
                sentenceStart = boundaries[i];
            }

            if (hasTrailingText)
            {
                positions[sentenceCount - 1] = new Util.Pair<int, int>(lastBoundary, input.Length - lastBoundary);
                result[sentenceCount - 1] = input.Substring(lastBoundary);
            }

            return result;
        }
コード例 #16
0
        /// <summary>
        /// Creates one sentence-detection event for every candidate position found in
        /// the given sentence text and links the events onto the chain held in
        /// <c>mHead</c> / <c>mTail</c>.
        /// </summary>
        /// <param name="token">
        /// The sentence text; it is trimmed before being appended to the shared buffer.
        /// </param>
        private void AddNewEvents(string token)
        {
            System.Text.StringBuilder text = mBuffer;
            text.Append(token.Trim());

            // Last character currently in the buffer: the only candidate labelled "T".
            int trueBoundary = text.Length - 1;

            // Append the first word of the following text so the context generator
            // can see what comes after the sentence end.
            bool canPeekAhead = mNext != null && token.Length > 0;
            if (canPeekAhead)
            {
                int firstSpace = mNext.IndexOf(" ");
                if (firstSpace != -1)
                {
                    // TODO (from the original author): usually add a space before the
                    // next sentence, but sometimes leave no space.
                    text.Append(" ");
                    text.Append(mNext.Substring(0, firstSpace));
                }
            }

            foreach (int candidate in mScanner.GetPositions(text))
            {
                string label = (candidate == trueBoundary) ? "T" : "F";
                Util.Pair<System.Text.StringBuilder, int> candidateContext = new Util.Pair<System.Text.StringBuilder, int>(text, candidate);
                SentenceDetectionEvent sentenceEvent = new SentenceDetectionEvent(label, mContextGenerator.GetContext(candidateContext));

                if (mTail != null)
                {
                    // Normal case: extend the chain at its tail.
                    mTail.NextEvent = sentenceEvent;
                    mTail = sentenceEvent;
                }
                else if (mHead == null)
                {
                    // Empty chain: the event becomes the head.
                    mHead = sentenceEvent;
                }
                else if (mHead.NextEvent == null)
                {
                    // Single-element chain: the event becomes both head's successor and the tail.
                    mTail = sentenceEvent;
                    mHead.NextEvent = sentenceEvent;
                }
            }

            // Reset the shared buffer for the next call.
            text.Length = 0;
        }