/// <summary>Issues an HTTP GET against an endpoint and reports whether it answered.</summary>
 /// <remarks>
 /// Useful for
 /// <see cref="Live()"/>
 /// and
 /// <see cref="Ready(bool)"/>
 /// .
 /// A URL that cannot be parsed, a URI whose connection is not an HTTP connection,
 /// and any I/O failure all count as an unsuccessful ping.
 /// </remarks>
 /// <param name="uri">The URL we are trying to ping.</param>
 /// <returns>
 /// True if the response code is outside the 500-599 range;
 /// false if it is a 5XX code or the request could not be made.
 /// </returns>
 protected internal virtual bool Ping(string uri)
 {
     try
     {
         URL target = new URL(uri);
         // The cast fails (and is reported below) when the URI is not an HTTP one.
         HttpURLConnection http = (HttpURLConnection)target.OpenConnection();
         http.SetRequestProperty("Accept-Charset", "UTF-8");
         http.SetRequestMethod("GET");
         http.Connect();
         int status = http.GetResponseCode();
         bool isServerError = status >= 500 && status < 600;
         return !isServerError;
     }
     catch (MalformedURLException)
     {
         log.Warn("Could not parse URL: " + uri);
     }
     catch (InvalidCastException)
     {
         log.Warn("Not an HTTP URI");
     }
     catch (IOException)
     {
         // Connection-level failures are silently treated as "not reachable".
     }
     return false;
 }
Пример #2
0
        /// <summary>Parses one sentence under the given constraints and collects the resulting trees.</summary>
        /// <remarks>
        /// When <c>kBest</c> is 1 only the best parse is requested; otherwise the k best parses are requested.
        /// Each returned tree gets its score set to the parse score modulo -10000.0.
        /// A failed parse is logged and yields an empty list rather than an exception.
        /// </remarks>
        /// <param name="constraints">Constraints handed to the parser query before parsing.</param>
        /// <param name="words">The tokens of the sentence to parse.</param>
        /// <returns>The scored parse trees; empty if no parse was produced.</returns>
        private IList <Tree> DoOneSentence(IList <ParserConstraint> constraints, IList <CoreLabel> words)
        {
            IParserQuery query = parser.ParserQuery();

            query.SetConstraints(constraints);
            query.Parse(words);
            IList<Tree> results = Generics.NewLinkedList();

            try
            {
                if (this.kBest != 1)
                {
                    IList<ScoredObject<Tree>> candidates = query.GetKBestParses(this.kBest);
                    bool haveCandidates = candidates != null && candidates.Count >= 1;
                    if (haveCandidates)
                    {
                        foreach (ScoredObject<Tree> candidate in candidates)
                        {
                            // -10000 denotes unknown words
                            Tree parsed = candidate.Object();
                            parsed.SetScore(candidate.Score() % -10000.0);
                            results.Add(parsed);
                        }
                    }
                    else
                    {
                        log.Warn("Parsing of sentence failed.  Will ignore and continue: " + SentenceUtils.ListToString(words));
                    }
                }
                else
                {
                    // Use bestParse if kBest is set to 1.
                    Tree best = query.GetBestParse();
                    if (best != null)
                    {
                        best.SetScore(query.GetBestScore() % -10000.0);
                        results.Add(best);
                    }
                    else
                    {
                        log.Warn("Parsing of sentence failed.  Will ignore and continue: " + SentenceUtils.ListToString(words));
                    }
                }
            }
            catch (OutOfMemoryException oom)
            {
                log.Error(oom);
                // Beware that we can now get an OOM in logging, too.
                log.Warn("Parsing of sentence ran out of memory (length=" + words.Count + ").  Will ignore and try to continue.");
            }
            catch (NoSuchParseException)
            {
                log.Warn("Parsing of sentence failed, possibly because of out of memory.  Will ignore and continue: " + SentenceUtils.ListToString(words));
            }
            return results;
        }
Пример #3
0
		/// <summary>Creates the classifier; SUTime is never actually enabled for Chinese.</summary>
		/// <param name="props">Properties forwarded to the base class constructor.</param>
		/// <param name="useSUTime">Recorded in the <c>useSUTime</c> field; a warning is logged when true.</param>
		/// <param name="sutimeProps">Not read by this constructor.</param>
		public ChineseNumberSequenceClassifier(Properties props, bool useSUTime, Properties sutimeProps)
			: base(props)
		{
			// No extractor is created regardless of the flag.
			this.timexExtractor = null;
			this.useSUTime      = useSUTime;
			if (useSUTime)
			{
				// TODO: Need a Chinese version of SUTime
				log.Warn("SUTime currently does not support Chinese. Ignore property ner.useSUTime.");
			}
		}
        /// <summary>Parses <c>editStr</c> into a list of (match pattern, surgery operation) pairs.</summary>
        /// <remarks>
        /// The text is read as groups: one line holding a Tregex pattern, followed by operation lines
        /// for as long as <c>Continuing</c> accepts them. The line that ends a group is consumed and discarded.
        /// A pattern with no operations is dropped. Read errors are logged and whatever was
        /// collected so far is returned.
        /// </remarks>
        /// <returns>The collected pattern/operation pairs, possibly empty.</returns>
        private static IList <Pair <TregexPattern, TsurgeonPattern> > LoadOps()
        {
            IList<Pair<TregexPattern, TsurgeonPattern>> result = new List<Pair<TregexPattern, TsurgeonPattern>>();

            try
            {
                BufferedReader reader = new BufferedReader(new StringReader(editStr));
                IList<TsurgeonPattern> pending = new List<TsurgeonPattern>();
                string current = reader.ReadLine();
                while (current != null)
                {
                    TregexPattern matcher = TregexPattern.Compile(current);
                    pending.Clear();
                    // Gather every operation line that belongs to this pattern.
                    current = reader.ReadLine();
                    while (Continuing(current))
                    {
                        pending.Add(Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(current));
                        current = reader.ReadLine();
                    }
                    if (pending.Count > 0)
                    {
                        TsurgeonPattern combined = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.CollectOperations(pending);
                        result.Add(new Pair<TregexPattern, TsurgeonPattern>(matcher, combined));
                    }
                    // The group terminator is skipped; the next line starts a new group.
                    current = reader.ReadLine();
                }
            }
            catch (IOException readFailure)
            {
                log.Warn(readFailure);
            }
            return result;
        }
        /// <summary>Assigns a part-of-speech tag to every token of a sentence, in place.</summary>
        /// <remarks>
        /// The sentence is tagged only if it has at most <c>maxSentenceLength</c> tokens.
        /// If it is too long, or tagging runs out of memory, every token receives the tag "X".
        /// </remarks>
        /// <param name="sentence">The sentence whose tokens are annotated.</param>
        /// <returns>The same <paramref name="sentence"/> instance, after annotation.</returns>
        private ICoreMap DoOneSentence(ICoreMap sentence)
        {
            IList <CoreLabel>  tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <TaggedWord> tagged = null;

            if (tokens.Count <= maxSentenceLength)
            {
                try
                {
                    tagged = pos.TagSentence(tokens, this.reuseTags);
                }
                catch (OutOfMemoryException e)
                {
                    log.Error(e);
                    // Beware that we can now get an OOM in logging, too.
                    log.Warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(tokens));
                }
            }
            if (tagged != null)
            {
                // sz is the token count, evaluated once; it must be declared here because
                // no such variable exists elsewhere in this method.
                for (int i = 0, sz = tokens.Count; i < sz; i++)
                {
                    tokens[i].Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), tagged[i].Tag());
                }
            }
            else
            {
                // Fallback tag for sentences that were skipped or could not be tagged.
                foreach (CoreLabel token in tokens)
                {
                    token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "X");
                }
            }
            return(sentence);
        }
Пример #6
0
        /// <summary>Loads the page at <paramref name="url"/> into the editor pane and resets the tagging state.</summary>
        /// <remarks>
        /// If the page cannot be loaded, the failure is logged and shown through <c>DisplayError</c>
        /// and no other state is touched. Content that is not "text/html" is re-inserted as editable
        /// "text/rtf"; HTML content is made read-only and its text is kept in <c>htmlContents</c>.
        /// </remarks>
        /// <param name="url">The address to load.</param>
        /// <exception cref="System.Exception">
        /// Thrown when the text cannot be inserted into the document; the underlying failure is the inner exception.
        /// </exception>
        private void OpenURL(string url)
        {
            try
            {
                editorPane.SetPage(url);
            }
            catch (Exception e)
            {
                log.Info("Error loading |" + url + '|');
                log.Warn(e);
                DisplayError("Error Loading URL " + url, "Message: " + e);
                return;
            }
            loadedFile = null;
            // Captured before the content type is switched below.
            string text = editorPane.GetText();

            taggedContents = null;
            if (!editorPane.GetContentType().Equals("text/html"))
            {
                editorPane.SetContentType("text/rtf");
                IDocument doc = editorPane.GetDocument();
                try
                {
                    doc.InsertString(0, text, defaultAttrSet);
                }
                catch (Exception e)
                {
                    // System.Exception has no constructor taking only an exception;
                    // wrap with the message and keep the cause as the inner exception.
                    throw new Exception(e.Message, e);
                }
                editorPane.Revalidate();
                editorPane.Repaint();
                editorPane.SetEditable(true);
                htmlContents = null;
            }
            else
            {
                editorPane.SetEditable(false);
                htmlContents = editorPane.GetText();
            }
            saveUntagged.SetEnabled(false);
            saveTaggedAs.SetEnabled(false);
        }
Пример #7
0
        /// <summary>Attributes a speaker to every quote in the document that does not have one yet.</summary>
        /// <remarks>
        /// For each unattributed quote, mentions are collected in a window of tokens before and after the quote
        /// and ranked through <c>GetTopSpeakers</c>; if nothing is found the search is repeated with the larger
        /// windows. Predictions are then tried in order: conversational next, conversational previous,
        /// family/animate vocative, and finally the first ranked speaker. The first one accepted by
        /// <c>UpdatePredictions</c> wins.
        /// </remarks>
        /// <param name="doc">The document providing the tokens and quotations annotations.</param>
        public virtual void TopSpeakerInRange(Annotation doc)
        {
            IList<CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<ICoreMap>  allQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            for (int q = 0; q < allQuotes.Count; q++)
            {
                ICoreMap quote = allQuotes[q];
                if (quote.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation)) != null)
                {
                    // Already has a speaker; nothing to do.
                    continue;
                }

                Pair<int, int> quoteRun = new Pair<int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));

                // First attempt: the regular windows around the quote.
                Pair<int, int> spanBefore = new Pair<int, int>(Math.Max(0, quoteRun.first - BackwardWindow), quoteRun.first - 1);
                IList<Sieve.MentionData> mentionsBefore = FindClosestMentionsInSpanBackward(spanBefore);
                Pair<int, int> spanAfter = new Pair<int, int>(quoteRun.second + 1, Math.Min(quoteRun.second + ForwardWindow, tokens.Count - 1));
                IList<Sieve.MentionData> mentions = FindClosestMentionsInSpanForward(spanAfter);
                Sharpen.Collections.AddAll(mentions, mentionsBefore);

                Person.Gender gender = GetGender(MakeMentionData(quote));
                IList<string> ranked = Counters.ToSortedList(GetTopSpeakers(mentions, mentionsBefore, gender, quote, false));

                if (ranked.IsEmpty())
                {
                    // Second attempt: the bigger windows.
                    spanBefore     = new Pair<int, int>(Math.Max(0, quoteRun.first - BackwardWindowBig), quoteRun.first - 1);
                    mentionsBefore = FindClosestMentionsInSpanBackward(spanBefore);
                    spanAfter      = new Pair<int, int>(quoteRun.second + 1, Math.Min(quoteRun.second + ForwardWindowBig, tokens.Count - 1));
                    mentions       = FindClosestMentionsInSpanForward(spanAfter);
                    ranked         = Counters.ToSortedList(GetTopSpeakers(mentions, mentionsBefore, gender, quote, true));
                }
                if (ranked.IsEmpty())
                {
                    log.Warn("Watch out, there's an empty top speakers list!");
                    continue;
                }

                ranked = RemoveQuoteNames(ranked, quote);
                string fallbackSpeaker = ranked[0];

                // Each prediction is only computed if the previous one was not accepted.
                if (UpdatePredictions(quote, GetConversationalNextPrediction(allQuotes, q, gender))
                    || UpdatePredictions(quote, GetConversationalPreviousPrediction(allQuotes, q, gender))
                    || UpdatePredictions(quote, GetFamilyAnimateVocative(allQuotes, q, gender, ranked)))
                {
                    continue;
                }
                UpdatePredictions(quote, new Pair<string, string>(fallbackSpeaker, string.Empty));
            }
        }
        // = null;
        // = null;
        /// <summary>Try to set up the NER tagger through reflection.</summary>
        /// <remarks>
        /// Looks up the type named by <c>NerCombinerName</c>, calls its static
        /// <c>createNERClassifierCombiner(string, Properties)</c> method with a null name and empty properties,
        /// and stores the result in <c>NerTagger</c> together with the <c>classify</c> method in
        /// <c>NerClassifyMethod</c>. Any failure is logged as a warning and NER tagging stays unavailable.
        /// </remarks>
        private static void SetupNERTagger()
        {
            Type NerTaggerClass;

            try
            {
                NerTaggerClass = Sharpen.Runtime.GetType(NerCombinerName);
            }
            catch (Exception)
            {
                log.Warn(NerCombinerName + " not found - not applying NER tags!");
                return;
            }
            try
            {
                MethodInfo createMethod = Sharpen.Runtime.GetDeclaredMethod(NerTaggerClass, "createNERClassifierCombiner", typeof(string), typeof(Properties));
                // Static method, so no target instance. The two arguments must be packed into a
                // single object[]; MethodInfo.Invoke does not take them as separate parameters.
                NerTagger         = createMethod.Invoke(null, new object[] { null, new Properties() });
                NerClassifyMethod = Sharpen.Runtime.GetDeclaredMethod(NerTaggerClass, "classify", typeof(IList));
            }
            catch (Exception)
            {
                log.Warn("Error setting up " + NerCombinerName + "! Not applying NER tags!");
            }
        }
 /// <summary>Adds a word to the lexicon unless it is blank, contains a space, or is rejected by <c>ExcludeChar</c>.</summary>
 /// <param name="str">The candidate lexicon entry.</param>
 private void AddStringToLexicon(string str)
 {
     if (str.Length == 0)
     {
         logger.Warn("WARNING: blank line in lexicon");
         return;
     }
     if (str.Contains(" "))
     {
         logger.Warn("WARNING: word with space in lexicon");
         return;
     }
     if (ExcludeChar(str))
     {
         PrintlnErr("skipping word: " + str);
         return;
     }
     words.Add(str);
 }
        /// <summary>Repeatedly applies a composite (higher order) rule until it stops producing new expressions.</summary>
        /// <remarks>
        /// Each round extracts expressions from the current merged list, annotates them, drops null values,
        /// nested and overlapping matches, and folds the survivors back into both the merged list and the
        /// matched expressions (new matches are placed ahead of older ones). The loop ends when a round
        /// yields nothing or when the iteration limit is reached.
        /// </remarks>
        /// <param name="compositeExtractRule">The rule to apply.</param>
        /// <param name="merged">The current sequence the rule is applied to.</param>
        /// <param name="matchedExpressions">Expressions matched so far.</param>
        /// <param name="limit">Maximum number of rounds; values of zero or less mean no limit.</param>
        /// <returns>The final merged list paired with the final matched expressions.</returns>
        private Pair <IList <ICoreMap>, IList <T> > ApplyCompositeRule <_T0>(SequenceMatchRules.IExtractRule <IList <ICoreMap>, T> compositeExtractRule, IList <_T0> merged, IList <T> matchedExpressions, int limit)
            where _T0 : ICoreMap
        {
            int rounds = 0;

            while (true)
            {
                IList<T> found = new List<T>();
                bool progressed = compositeExtractRule.Extract(merged, found);
                if (progressed)
                {
                    if (verbose)
                    {
                        log.Info("applyCompositeRule() extracting with " + compositeExtractRule + " from " + merged + " gives " + found);
                    }
                    AnnotateExpressions(merged, found);
                    found = MatchedExpression.RemoveNullValues(found);
                    if (found.IsEmpty())
                    {
                        // Everything extracted turned out to be null: treat as no progress.
                        progressed = false;
                    }
                    else
                    {
                        found  = MatchedExpression.RemoveNested(found);
                        found  = MatchedExpression.RemoveOverlapping(found);
                        merged = MatchedExpression.ReplaceMerged(merged, found);
                        // Favor newly matched expressions over older ones
                        Sharpen.Collections.AddAll(found, matchedExpressions);
                        matchedExpressions = MatchedExpression.RemoveNested(found);
                        matchedExpressions = MatchedExpression.RemoveOverlapping(matchedExpressions);
                    }
                }
                rounds++;
                // The limit is checked before the "no progress" exit so the warning is
                // still emitted when both conditions hold in the same round.
                if (limit > 0 && rounds >= limit)
                {
                    if (verbose)
                    {
                        log.Warn("Aborting application of composite rules: Maximum iteration " + limit + " reached");
                    }
                    break;
                }
                if (!progressed)
                {
                    break;
                }
            }
            return new Pair<IList<ICoreMap>, IList<T>>(merged, matchedExpressions);
        }
Пример #11
0
        /// <summary>Segments everything readable from the input and writes the result to the output.</summary>
        /// <param name="segmenter">The segmenter that processes the text.</param>
        /// <param name="br">Source of the input lines.</param>
        /// <param name="pwOut">Destination for the segmented output.</param>
        /// <param name="nThreads">
        /// Number of threads; asserted to be positive. With more than one thread the lines are fed
        /// through a <c>MulticoreWrapper</c>, otherwise the segmenter reads the input directly.
        /// </param>
        /// <returns>Input characters processed per second.</returns>
        private static double Decode(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter, BufferedReader br, PrintWriter pwOut, int nThreads)
        {
            System.Diagnostics.Debug.Assert(nThreads > 0);
            long began     = Runtime.NanoTime();
            long charCount = 0;

            if (nThreads <= 1)
            {
                charCount = segmenter.Segment(br, pwOut);
            }
            else
            {
                MulticoreWrapper<string, string> pool = new MulticoreWrapper<string, string>(nThreads, segmenter);
                try
                {
                    string input = br.ReadLine();
                    while (input != null)
                    {
                        charCount += input.Length;
                        pool.Put(input);
                        // Emit whatever is already finished before queueing more work.
                        while (pool.Peek())
                        {
                            pwOut.Println(pool.Poll());
                        }
                        input = br.ReadLine();
                    }
                    pool.Join();
                    // Flush the results that completed during the join.
                    while (pool.Peek())
                    {
                        pwOut.Println(pool.Poll());
                    }
                }
                catch (IOException readFailure)
                {
                    log.Warn(readFailure);
                }
            }

            long   elapsedNanos   = Runtime.NanoTime() - began;
            double elapsedSeconds = elapsedNanos / 1000000000.0;

            return (double)charCount / elapsedSeconds;
        }
Пример #12
0
        /// <summary>
        /// Read a gazette mapping in TokensRegex format from the given path
        /// The format is: 'case_sensitive_word \t target_ner_class' (additional info is ignored).
        /// </summary>
        /// <remarks>
        /// Lines that do not contain at least two tab-separated fields (for example blank lines,
        /// including the empty entry produced by a trailing newline) are skipped.
        /// When a word occurs more than once, the last occurrence wins.
        /// If the file cannot be read, a warning is logged and the entries read so far are returned.
        /// </remarks>
        /// <param name="mappingFile">The mapping file to read from, as a path either on the filesystem or in your classpath.</param>
        /// <returns>The mapping from word to NER tag, wrapped as an unmodifiable map.</returns>
        private static IDictionary <string, string> ReadRegexnerGazette(string mappingFile)
        {
            IDictionary <string, string> mapping = new Dictionary <string, string>();

            try
            {
                using (BufferedReader reader = IOUtils.ReaderFromString(mappingFile.Trim()))
                {
                    foreach (string line in IOUtils.SlurpReader(reader).Split("\n"))
                    {
                        string[] fields = line.Split("\t");
                        if (fields.Length < 2)
                        {
                            // Not a 'word \t class' entry; indexing fields[1] would throw.
                            continue;
                        }
                        string key    = fields[0];
                        string target = fields[1];
                        mapping[key] = target;
                    }
                }
            }
            catch (IOException)
            {
                log.Warn("Could not read Regex mapping: " + mappingFile);
            }
            return(Java.Util.Collections.UnmodifiableMap(mapping));
        }
Пример #13
0
        /// <summary>Extracts time expressions from an annotation and stores them on it.</summary>
        /// <remarks>
        /// The reference date is the document date annotation when present; otherwise the calendar annotation
        /// formatted as <c>yyyy-MM-dd:hh:mm:ss</c>; otherwise null (with a warning unless <c>quiet</c> is set).
        /// If the annotation has sentences, each one is aligned and processed separately: its time expressions
        /// and numerized tokens are stored on the sentence and accumulated for the whole annotation.
        /// Without sentences the annotation is handed to <c>AnnotateSingleSentence</c>.
        /// </remarks>
        /// <param name="annotation">The annotation to process; it is modified in place.</param>
        public virtual void Annotate(Annotation annotation)
        {
            SUTime.TimeIndex timeIndex = new SUTime.TimeIndex();
            string referenceDate = annotation.Get(typeof(CoreAnnotations.DocDateAnnotation));

            if (referenceDate == null)
            {
                Calendar calendar = annotation.Get(typeof(CoreAnnotations.CalendarAnnotation));
                if (calendar != null)
                {
                    SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd:hh:mm:ss");
                    referenceDate = formatter.Format(calendar.GetTime());
                }
                else if (!quiet)
                {
                    log.Warn("No document date specified");
                }
            }

            IList<ICoreMap> collected;
            IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));

            if (sentences == null)
            {
                collected = AnnotateSingleSentence(annotation, referenceDate, timeIndex);
            }
            else
            {
                collected = new List<ICoreMap>();
                IList<ICoreMap> numericsSoFar = new List<ICoreMap>();
                foreach (ICoreMap sentence in sentences)
                {
                    // make sure that token character offsets align with the actual sentence text
                    // They may not align due to token normalizations, such as "(" to "-LRB-".
                    ICoreMap aligned = NumberSequenceClassifier.AlignSentence(sentence);

                    IList<ICoreMap> found = timexExtractor.ExtractTimeExpressionCoreMaps(aligned, referenceDate, timeIndex);
                    if (found != null)
                    {
                        Sharpen.Collections.AddAll(collected, found);
                        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), found);
                        // Tag each expression with the index of the sentence it came from.
                        foreach (ICoreMap expression in found)
                        {
                            expression.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
                        }
                    }

                    // Numerized tokens are read from the aligned copy and stored on the original sentence.
                    IList<ICoreMap> numerized = aligned.Get(typeof(CoreAnnotations.NumerizedTokensAnnotation));
                    if (numerized != null)
                    {
                        sentence.Set(typeof(CoreAnnotations.NumerizedTokensAnnotation), numerized);
                        Sharpen.Collections.AddAll(numericsSoFar, numerized);
                    }
                }
                annotation.Set(typeof(CoreAnnotations.NumerizedTokensAnnotation), numericsSoFar);
            }
            annotation.Set(typeof(TimeAnnotations.TimexAnnotations), collected);
        }
        /// <summary>
        /// reads in the features from a file, having already read the
        /// experiments
        /// </summary>
        /// <remarks>
        /// Expected layout: any number of lines, then a line equal to <c>&lt;features&gt;</c>, then a line of the
        /// form <c>&lt;fSize&gt;N&lt;/fSize&gt;</c>, then N feature lines. Each feature line is a space-separated
        /// sequence of <c>x y value</c> triples; every triple becomes one (index, value) entry of the feature.
        /// A file that ends before the expected content is reported as "Incorrect data file format!".
        /// All failures, including format errors, are caught and logged as a warning; features read
        /// before the failure remain added.
        /// </remarks>
        /// <param name="filename">Path of the data file to read.</param>
        /// <param name="domain">The experiments the features refer to; also supplies the instance index.</param>
        public Features(string filename, Experiments domain)
        {
            Exception        e1            = new Exception("Incorrect data file format!");
            IIndex <IntPair> instanceIndex = domain.CreateIndex();

            try
            {
                using (BufferedReader @in = new BufferedReader(new FileReader(filename)))
                {
                    // Skip ahead to the <features> marker. Test for end of file BEFORE comparing,
                    // so a file without the marker is reported as a format error.
                    string s;
                    do
                    {
                        s = @in.ReadLine();
                    }
                    while (s != null && !s.Equals("<features>"));
                    if (s == null)
                    {
                        throw e1;
                    }
                    s = @in.ReadLine();
                    if (s == null || !s.StartsWith("<fSize>") || !s.EndsWith("</fSize>"))
                    {
                        throw e1;
                    }
                    int    index1 = s.IndexOf(">");
                    int    index2 = s.LastIndexOf("<");
                    string fSt    = Sharpen.Runtime.Substring(s, index1 + 1, index2);
                    System.Console.Out.WriteLine(fSt);
                    int number = System.Convert.ToInt32(fSt);
                    System.Console.Out.WriteLine("fSize is " + number);
                    // Scratch buffers reused for every feature line.
                    int[]    arrIndexes = new int[maxValue];
                    double[] arrValues  = new double[maxValue];
                    for (int f = 0; f < number; f++)
                    {
                        string line = @in.ReadLine();
                        if (line == null)
                        {
                            // Fewer feature lines than announced by fSize.
                            throw e1;
                        }
                        int indSp   = -1;
                        int current = 0;
                        while ((indSp = line.IndexOf(" ")) > -1)
                        {
                            int x = System.Convert.ToInt32(Sharpen.Runtime.Substring(line, 0, indSp));
                            line  = Sharpen.Runtime.Substring(line, indSp + 1);
                            indSp = line.IndexOf(" ");
                            if (indSp == -1)
                            {
                                indSp = line.Length;
                            }
                            int y = System.Convert.ToInt32(Sharpen.Runtime.Substring(line, 0, indSp));
                            line  = Sharpen.Runtime.Substring(line, indSp + 1);
                            indSp = line.IndexOf(" ");
                            if (indSp == -1)
                            {
                                indSp = line.Length;
                            }
                            // Parsed with the invariant culture so "." is always the decimal separator.
                            double val = double.Parse(Sharpen.Runtime.Substring(line, 0, indSp), System.Globalization.CultureInfo.InvariantCulture);
                            if (indSp < line.Length)
                            {
                                line = Sharpen.Runtime.Substring(line, indSp + 1);
                            }
                            arrIndexes[current] = instanceIndex.IndexOf(new IntPair(x, y));
                            arrValues[current]  = val;
                            current++;
                        }
                        // Copy the used prefix of the scratch buffers into right-sized arrays.
                        int[]    indValues = new int[current];
                        double[] values    = new double[current];
                        for (int j = 0; j < current; j++)
                        {
                            indValues[j] = arrIndexes[j];
                            values[j]    = arrValues[j];
                        }
                        Feature bf = new Feature(domain, indValues, values, instanceIndex);
                        this.Add(bf);
                    }
                }
            }
            catch (Exception e)
            {
                log.Warn(e);
            }
        }
        /// <summary>Extracts time expressions from an annotation, taking the reference date from its document.</summary>
        /// <remarks>
        /// The time index is taken from <paramref name="docAnnotation"/> (and created and stored there if absent),
        /// or created fresh when there is no document annotation. The document date is, in order of preference:
        /// the section date of <paramref name="annotation"/>, the document date annotation, or the calendar
        /// annotation formatted as <c>yyyy-MM-dd:hh:mm:ss</c>. An empty date is treated as missing.
        /// If the time index has no document date yet, the resolved date is parsed and stored on it.
        /// </remarks>
        /// <param name="annotation">The text to extract time expressions from.</param>
        /// <param name="docAnnotation">The enclosing document, or null.</param>
        /// <returns>The result of the overload taking a reference date and a time index.</returns>
        /// <exception cref="System.Exception">If the document date cannot be parsed.</exception>
        public virtual IList <ICoreMap> ExtractTimeExpressionCoreMaps(ICoreMap annotation, ICoreMap docAnnotation)
        {
            string docDate = null;
            SUTime.TimeIndex timeIndex;

            if (docAnnotation == null)
            {
                timeIndex = new SUTime.TimeIndex();
            }
            else
            {
                timeIndex = docAnnotation.Get(typeof(TimeExpression.TimeIndexAnnotation));
                if (timeIndex == null)
                {
                    timeIndex = new SUTime.TimeIndex();
                    docAnnotation.Set(typeof(TimeExpression.TimeIndexAnnotation), timeIndex);
                }

                // default look for the sentence's forum post date
                // if it doesn't have one, back off to the document date
                docDate = annotation.Get(typeof(CoreAnnotations.SectionDateAnnotation)) != null
                    ? annotation.Get(typeof(CoreAnnotations.SectionDateAnnotation))
                    : docAnnotation.Get(typeof(CoreAnnotations.DocDateAnnotation));

                if (docDate == null)
                {
                    Calendar calendar = docAnnotation.Get(typeof(CoreAnnotations.CalendarAnnotation));
                    if (calendar != null)
                    {
                        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd:hh:mm:ss");
                        docDate = formatter.Format(calendar.GetTime());
                    }
                    else if (options.verbose)
                    {
                        logger.Warn("WARNING: No document date specified");
                    }
                }
            }

            // Normalize an empty date to "no date".
            if (StringUtils.IsNullOrEmpty(docDate))
            {
                docDate = null;
            }

            bool needsParsing = timeIndex.docDate == null && docDate != null;
            if (needsParsing)
            {
                try
                {
                    // TODO: have more robust parsing of document date?  docDate may not have century....
                    // TODO: if docDate didn't change, we can cache the parsing of the docDate and not repeat it for every sentence
                    timeIndex.docDate = SUTime.ParseDateTime(docDate, true);
                }
                catch (Exception parseFailure)
                {
                    throw new Exception("Could not parse date string: [" + docDate + "]", parseFailure);
                }
            }

            // The section date, when present, takes precedence as the reference date.
            string sectionDate = annotation.Get(typeof(CoreAnnotations.SectionDateAnnotation));
            string refDate     = sectionDate ?? docDate;

            return ExtractTimeExpressionCoreMaps(annotation, refDate, timeIndex);
        }
        private void RunSegmentation(ICoreMap annotation)
        {
            //0 2
            // A BC D E
            // 1 10 1 1
            // 0 12 3 4
            // 0, 0+1 ,
            string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            // the original text String
            IList <CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));

            // the way it was divided by splitCharacters
            if (Verbose)
            {
                log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
            }
            IList <CoreLabel> tokens = new List <CoreLabel>();

            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            // Run the segmenter! On the whole String. It knows not about the splitting into chars.
            // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
            IList <string> words;

            if (!tokenizeNewline)
            {
                text  = text.ReplaceAll("[\r\n]", string.Empty);
                words = segmenter.SegmentString(text);
            }
            else
            {
                // remove leading and trailing newlines
                text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
                text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
                // if using the sentence split on two newlines option, replace single newlines
                // single newlines should be ignored for segmenting
                if (sentenceSplitOnTwoNewlines)
                {
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                    // do a second pass to handle corner case of consecutive isolated newlines
                    // x \n x \n x
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                }
                // Run the segmenter on each line so that we don't get tokens that cross line boundaries
                // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
                string[] lines = text.Split(string.Format("((?<=%1$s)|(?=%1$s))", separator));
                words = new List <string>();
                foreach (string line in lines)
                {
                    if (separatorPattern.Matcher(line).Matches())
                    {
                        // Don't segment newline tokens, keep them as-is
                        words.Add(line);
                    }
                    else
                    {
                        Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
                    }
                }
            }
            if (Verbose)
            {
                log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
            }
            // Go through everything again and make the final tokens list; for loop is over segmented words
            int pos = 0;
            // This is used to index sentChars, the output from splitCharacters
            StringBuilder xmlBuffer = new StringBuilder();
            int           xmlBegin  = -1;

            foreach (string w in words)
            {
                CoreLabel fl = sentChars[pos];
                string    xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
                if (Verbose)
                {
                    log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
                }
                if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
                {
                    // Beginnings of plain text and other XML tags are good places to end an XML tag
                    if (xmlBuffer.Length > 0)
                    {
                        // Form the XML token
                        string    xmlTag = xmlBuffer.ToString();
                        CoreLabel fl1    = sentChars[pos - 1];
                        int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                        // Clean up and prepare for the next XML tag
                        xmlBegin  = -1;
                        xmlBuffer = new StringBuilder();
                    }
                }
                if (!"0".Equals(xmlCharAnnotation))
                {
                    // found an XML character; fl changes inside this loop!
                    while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
                    {
                        // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                        // and we're in sync with segmenter output again
                        xmlBuffer.Append(' ');
                        pos += 1;
                        fl   = sentChars[pos];
                    }
                    xmlBuffer.Append(w);
                    pos = AdvancePos(sentChars, pos, w);
                    if (xmlBegin < 0)
                    {
                        xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    }
                    continue;
                }
                // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
                fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
                if (w.IsEmpty())
                {
                    if (Verbose)
                    {
                        log.Warn("Encountered an empty word. Shouldn't happen?");
                    }
                    continue;
                }
                // [cdm 2016:] surely this shouldn't happen!
                int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                pos = AdvancePos(sentChars, pos, w);
                if (pos - 1 >= sentChars.Count)
                {
                    log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
                    log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
                }
                else
                {
                    fl = sentChars[pos - 1];
                    int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    tokens.Add(MakeXmlToken(w, false, begin, end));
                }
            }
            // end for (go through everything again)
            if (xmlBuffer.Length > 0)
            {
                // Form the last XML token, if any
                string    xmlTag = xmlBuffer.ToString();
                CoreLabel fl1    = sentChars[pos - 1];
                int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
            }
            if (Verbose)
            {
                foreach (CoreLabel token in tokens)
                {
                    log.Info(token.ToShorterString());
                }
            }
        }
Пример #17
0
        // 1. Create the input
        // 1.1 Create a protocol buffer
        // 1.2 Create the query params
        // 2. Create a connection
        // 3. Do the annotation
        //    This method has two contracts:
        //    1. It should call the two relevant callbacks
        //    2. It must not throw an exception
        /// <summary>Sends one annotation request to the server and copies the response into <paramref name="annotation"/>.</summary>
        /// <remarks>
        /// Factored out so that a failed attempt can be retried. Any exception raised while building the
        /// request, sending it, or reading the response is logged and the method calls itself again with
        /// <paramref name="tries"/> + 1, as long as <paramref name="tries"/> is below 3. When an attempt
        /// made with <paramref name="tries"/> of 3 or more fails, the failure is rethrown wrapped in a new
        /// <see cref="Exception"/> that carries the original as its inner exception.
        /// </remarks>
        /// <param name="annotation">The annotation we need to fill.</param>
        /// <param name="backend">The backend we are querying against; its protocol must be "http" or "https".</param>
        /// <param name="serverURL">The URL of the server we are hitting.</param>
        /// <param name="message">The message we are sending the server (don't need to recompute each retry).</param>
        /// <param name="tries">The number of times we've tried already.</param>
        /// <exception cref="Exception">If the attempt fails and <paramref name="tries"/> is 3 or more.</exception>
        private void DoAnnotation(Annotation annotation, StanfordCoreNLPClient.Backend backend, URL serverURL, byte[] message, int tries)
        {
            try
            {
                // 1. Set up the connection
                URLConnection connection = serverURL.OpenConnection();
                // 1.1 Set authentication (HTTP basic auth, only when both halves of the credential are configured)
                if (apiKey != null && apiSecret != null)
                {
                    string userpass  = apiKey + ":" + apiSecret;
                    string basicAuth = "Basic " + Sharpen.Runtime.GetStringForBytes(Base64.GetEncoder().Encode(Sharpen.Runtime.GetBytesForString(userpass)));
                    connection.SetRequestProperty("Authorization", basicAuth);
                }
                // 1.2 Set some protocol-independent properties
                connection.SetDoOutput(true);
                connection.SetRequestProperty("Content-Type", "application/x-protobuf");
                // Header values must be plain decimal digits regardless of the current culture.
                connection.SetRequestProperty("Content-Length", message.Length.ToString(System.Globalization.CultureInfo.InvariantCulture));
                connection.SetRequestProperty("Accept-Charset", "utf-8");
                connection.SetRequestProperty("User-Agent", typeof(StanfordCoreNLPClient).FullName);
                switch (backend.protocol)
                {
                case "https":
                case "http":
                {
                    // 1.3 Set some protocol-dependent properties
                    ((HttpURLConnection)connection).SetRequestMethod("POST");
                    break;
                }

                default:
                {
                    // Thrown inside the try block, so it goes through the same retry path as any other failure.
                    throw new InvalidOperationException("Haven't implemented protocol: " + backend.protocol);
                }
                }
                // 2. Annotate
                // 2.1. Fire off the request
                connection.Connect();
                connection.GetOutputStream().Write(message);
                connection.GetOutputStream().Flush();
                // 2.2 Await a response
                // -- It might be possible to send more than one message, but we are not going to do that.
                Annotation response = serializer.Read(connection.GetInputStream()).first;
                // 2.3. Copy response over to original annotation
                foreach (Type key in response.KeySet())
                {
                    annotation.Set(key, response.Get(key));
                }
            }
            catch (Exception t)
            {
                // 3. We encountered an error -- retry
                if (tries < 3)
                {
                    log.Warn(t);
                    DoAnnotation(annotation, backend, serverURL, message, tries + 1);
                }
                else
                {
                    // Out of retries: surface the failure, keeping the original exception as the cause.
                    throw new Exception(t.Message, t);
                }
            }
        }
Пример #18
0
        /// <summary>Normalizes a string one UTF-16 <c>char</c> at a time according to three option codes.</summary>
        /// <remarks>
        /// Every char goes through three stages in order: ASCII/fullwidth conversion, space handling and
        /// middle-dot handling. A char marked for deletion by the space or middle-dot stage is left out of
        /// the result. High surrogates and private-use-area chars only trigger a warning; they are then
        /// processed like any other char.
        /// </remarks>
        /// <param name="in">The text to normalize.</param>
        /// <param name="ascii">
        /// <c>Leave</c>, <c>Ascii</c> (U+FF01..U+FF5E are shifted down to U+0021..U+007E) or
        /// <c>Fullwidth</c> (the reverse shift).
        /// </param>
        /// <param name="spaceChar">
        /// <c>Leave</c>, <c>Ascii</c>, <c>Fullwidth</c>, <c>Delete</c> or <c>DeleteExceptBetweenAscii</c>.
        /// Any other value leaves space chars untouched.
        /// </param>
        /// <param name="midDot"><c>Leave</c>, <c>Normalize</c>, <c>Fullwidth</c> or <c>Delete</c>.</param>
        /// <returns>The normalized text.</returns>
        /// <exception cref="ArgumentException">
        /// If <paramref name="ascii"/> or <paramref name="midDot"/> is not one of the supported options.
        /// The check happens per char, so an empty input never throws.
        /// </exception>
        private static string NormalizeBMP(string @in, int ascii, int spaceChar, int midDot)
        {
            // Distance between an ASCII char and its fullwidth counterpart (U+0021 <-> U+FF01).
            const int fullwidthShift = 0xFF00 - 0x0020;

            int           length = @in.Length;
            StringBuilder result = new StringBuilder();

            for (int idx = 0; idx < length; idx++)
            {
                char ch = @in[idx];

                // Surrogates are only reported, never transformed or skipped.
                if (char.IsHighSurrogate(ch))
                {
                    string prefix = (idx + 1 < length)
                        ? "ChineseUtils.normalize warning: non-BMP codepoint U+"
                        : "ChineseUtils.normalize warning: unmatched high surrogate character U+";
                    log.Warn(prefix + int.ToHexString(char.CodePointAt(@in, idx)) + " in " + @in);
                }

                Character.UnicodeBlock block = Character.UnicodeBlock.Of(ch);
                bool inPrivateUse = block == Character.UnicodeBlock.PrivateUseArea
                                    || block == Character.UnicodeBlock.SupplementaryPrivateUseAreaA
                                    || block == Character.UnicodeBlock.SupplementaryPrivateUseAreaB;
                if (inPrivateUse)
                {
                    EncodingPrintWriter.Err.Println("ChineseUtils.normalize warning: private use area codepoint U+" + int.ToHexString(ch) + " in " + @in);
                }

                // Stage 1: ASCII <-> fullwidth conversion.
                if (ascii == Ascii)
                {
                    if (ch >= '\uFF01' && ch <= '\uFF5E')
                    {
                        ch = (char)(ch - fullwidthShift);
                    }
                }
                else if (ascii == Fullwidth)
                {
                    if (ch >= '\u0021' && ch <= '\u007E')
                    {
                        ch = (char)(ch + fullwidthShift);
                    }
                }
                else if (ascii != Leave)
                {
                    throw new ArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
                }

                // Stage 2: space handling. Unrecognized spaceChar values fall through without effect.
                bool drop = false;
                if (spaceChar == Ascii)
                {
                    if (char.IsSpaceChar(ch))
                    {
                        ch = ' ';
                    }
                }
                else if (spaceChar == Fullwidth)
                {
                    if (char.IsSpaceChar(ch))
                    {
                        ch = '\u3000';
                    }
                }
                else if (spaceChar == Delete)
                {
                    drop = char.IsSpaceChar(ch);
                }
                else if (spaceChar == DeleteExceptBetweenAscii)
                {
                    // Neighbours come from the untouched input; a missing neighbour is represented by U+0000.
                    char before = idx > 0 ? @in[idx - 1] : '\0';
                    char after  = idx < length - 1 ? @in[idx + 1] : '\0';
                    // A space survives only when both of its neighbours pass IsAsciiLowHigh.
                    if (char.IsSpaceChar(ch) && (!IsAsciiLowHigh(before) || !IsAsciiLowHigh(after)))
                    {
                        drop = true;
                    }
                }

                // Stage 3: middle-dot handling.
                if (midDot == Normalize)
                {
                    if (IsMidDot(ch))
                    {
                        ch = '\u00B7';
                    }
                }
                else if (midDot == Fullwidth)
                {
                    if (IsMidDot(ch))
                    {
                        ch = '\u30FB';
                    }
                }
                else if (midDot == Delete)
                {
                    if (IsMidDot(ch))
                    {
                        drop = true;
                    }
                }
                else if (midDot != Leave)
                {
                    throw new ArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
                }

                if (!drop)
                {
                    result.Append(ch);
                }
            }

            return result.ToString();
        }