/// <summary>A utility to ping an endpoint.</summary>
/// <remarks>
/// A utility to ping an endpoint. Useful for
/// <see cref="Live()"/> and <see cref="Ready(bool)"/>.
/// </remarks>
/// <param name="uri">The URL we are trying to ping.</param>
/// <returns>True if we got any non-5XX response from the endpoint.</returns>
protected internal virtual bool Ping(string uri)
{
    try
    {
        URL target = new URL(uri);
        HttpURLConnection conn = (HttpURLConnection)target.OpenConnection();
        conn.SetRequestProperty("Accept-Charset", "UTF-8");
        conn.SetRequestMethod("GET");
        conn.Connect();
        int status = conn.GetResponseCode();
        // Anything outside [500, 600) counts as the endpoint being alive.
        return !(status >= 500 && status < 600);
    }
    catch (MalformedURLException)
    {
        log.Warn("Could not parse URL: " + uri);
        return false;
    }
    catch (InvalidCastException)
    {
        // OpenConnection() did not return an HTTP(S) connection.
        log.Warn("Not an HTTP URI");
        return false;
    }
    catch (IOException)
    {
        // Could not reach the endpoint at all.
        return false;
    }
}
/// <summary>Parses one tokenized sentence, honoring any parser constraints.</summary>
/// <param name="constraints">Span constraints imposed on the parser query.</param>
/// <param name="words">The tokens of the sentence to parse.</param>
/// <returns>The single best parse (when kBest == 1) or up to kBest parses; an empty list if parsing failed.</returns>
private IList<Tree> DoOneSentence(IList<ParserConstraint> constraints, IList<CoreLabel> words)
{
    IParserQuery pq = parser.ParserQuery();
    pq.SetConstraints(constraints);
    pq.Parse(words);
    IList<Tree> trees = Generics.NewLinkedList();
    try
    {
        // Use bestParse if kBest is set to 1.
        if (this.kBest == 1)
        {
            Tree t = pq.GetBestParse();
            if (t == null)
            {
                log.Warn("Parsing of sentence failed. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            }
            else
            {
                double score = pq.GetBestScore();
                // -10000 denotes unknown words; the mod strips that offset from the score.
                // NOTE(review): relies on the parser's unknown-word scoring convention -- confirm.
                t.SetScore(score % -10000.0);
                trees.Add(t);
            }
        }
        else
        {
            IList<ScoredObject<Tree>> scoredObjects = pq.GetKBestParses(this.kBest);
            if (scoredObjects == null || scoredObjects.Count < 1)
            {
                log.Warn("Parsing of sentence failed. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            }
            else
            {
                foreach (ScoredObject<Tree> so in scoredObjects)
                {
                    // -10000 denotes unknown words
                    Tree tree = so.Object();
                    tree.SetScore(so.Score() % -10000.0);
                    trees.Add(tree);
                }
            }
        }
    }
    catch (OutOfMemoryException e)
    {
        log.Error(e);
        // Beware that we can now get an OOM in logging, too.
        log.Warn("Parsing of sentence ran out of memory (length=" + words.Count + "). " + "Will ignore and try to continue.");
    }
    catch (NoSuchParseException)
    {
        log.Warn("Parsing of sentence failed, possibly because of out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
    }
    return(trees);
}
/// <summary>Builds the classifier; SUTime is not yet available for Chinese, so no timex extractor is created.</summary>
/// <param name="props">Configuration properties, forwarded to the base classifier.</param>
/// <param name="useSUTime">Whether the caller requested SUTime; if true a warning is logged and the request is ignored.</param>
/// <param name="sutimeProps">SUTime-specific properties (currently unused).</param>
public ChineseNumberSequenceClassifier(Properties props, bool useSUTime, Properties sutimeProps)
    : base(props)
{
    //import edu.stanford.nlp.pipeline.StanfordCoreNLP;
    this.useSUTime = useSUTime;
    // TODO: Need a Chinese version of SUTime
    if (this.useSUTime)
    {
        log.Warn("SUTime currently does not support Chinese. Ignore property ner.useSUTime.");
    }
    this.timexExtractor = null;
}
/// <summary>Parses the static editStr script into (Tregex match pattern, Tsurgeon operation) pairs.</summary>
/// <returns>The list of compiled pattern/operation pairs; empty (or partial) if reading failed.</returns>
private static IList<Pair<TregexPattern, TsurgeonPattern>> LoadOps()
{
    IList<Pair<TregexPattern, TsurgeonPattern>> ops = new List<Pair<TregexPattern, TsurgeonPattern>>();
    try
    {
        BufferedReader br = new BufferedReader(new StringReader(editStr));
        IList<TsurgeonPattern> tsp = new List<TsurgeonPattern>();
        // while not at end of file
        for (string line; (line = br.ReadLine()) != null;)
        {
            // The first line of each stanza is the Tregex match pattern...
            TregexPattern matchPattern = TregexPattern.Compile(line);
            tsp.Clear();
            // ...followed by one Tsurgeon operation per line until Continuing() says stop.
            while (Continuing(line = br.ReadLine()))
            {
                TsurgeonPattern p = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ParseOperation(line);
                tsp.Add(p);
            }
            if (!tsp.IsEmpty())
            {
                // NOTE(review): tsp is reused (Cleared) each stanza; assumes CollectOperations
                // copies the operations rather than keeping the list reference -- confirm.
                TsurgeonPattern tp = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.CollectOperations(tsp);
                ops.Add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, tp));
            }
        }
    }
    catch (IOException ioe)
    {
        log.Warn(ioe);
    }
    return(ops);
}
/// <summary>POS-tags one sentence in place.</summary>
/// <remarks>
/// Sentences longer than maxSentenceLength are not tagged; on tagging failure
/// (or skipped sentences) every token falls back to the "X" tag.
/// </remarks>
/// <param name="sentence">The sentence whose tokens should receive part-of-speech tags.</param>
/// <returns>The same sentence object, with PartOfSpeechAnnotation set on each token.</returns>
private ICoreMap DoOneSentence(ICoreMap sentence)
{
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<TaggedWord> tagged = null;
    if (tokens.Count <= maxSentenceLength)
    {
        try
        {
            tagged = pos.TagSentence(tokens, this.reuseTags);
        }
        catch (OutOfMemoryException e)
        {
            log.Error(e);
            // Beware that we can now get an OOM in logging, too.
            log.Warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(tokens));
        }
    }
    if (tagged != null)
    {
        // BUGFIX: the loop bound 'sz' was never declared (lost in transpilation);
        // iterate over the token count, which matches the tagged list one-to-one.
        int sz = tokens.Count;
        for (int i = 0; i < sz; i++)
        {
            tokens[i].Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), tagged[i].Tag());
        }
    }
    else
    {
        // Tagging failed or the sentence was too long: use the unknown tag "X".
        foreach (CoreLabel token in tokens)
        {
            token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "X");
        }
    }
    return(sentence);
}
/// <summary>Loads the given URL into the editor pane, converting non-HTML content to editable RTF.</summary>
/// <param name="url">The URL to load and display.</param>
private void OpenURL(string url)
{
    try
    {
        editorPane.SetPage(url);
    }
    catch (Exception e)
    {
        log.Info("Error loading |" + url + '|');
        log.Warn(e);
        DisplayError("Error Loading URL " + url, "Message: " + e);
        return;
    }
    // We are now displaying a URL, not a local file.
    loadedFile = null;
    string text = editorPane.GetText();
    taggedContents = null;
    if (!editorPane.GetContentType().Equals("text/html"))
    {
        // Plain content: re-insert the text as editable RTF with the default attributes.
        editorPane.SetContentType("text/rtf");
        IDocument doc = editorPane.GetDocument();
        try
        {
            doc.InsertString(0, text, defaultAttrSet);
        }
        catch (Exception e)
        {
            throw new Exception(e);
        }
        editorPane.Revalidate();
        editorPane.Repaint();
        editorPane.SetEditable(true);
        htmlContents = null;
    }
    else
    {
        // HTML is shown read-only; keep the raw markup around for later saving.
        editorPane.SetEditable(false);
        htmlContents = editorPane.GetText();
    }
    // Nothing to save until the content has been (re-)tagged.
    saveUntagged.SetEnabled(false);
    saveTaggedAs.SetEnabled(false);
}
/// <summary>Attributes a speaker to each not-yet-attributed quote by ranking mentions found in windows around the quote.</summary>
/// <param name="doc">The annotated document whose quotes should receive speaker predictions.</param>
public virtual void TopSpeakerInRange(Annotation doc)
{
    IList<CoreLabel> toks = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    for (int quote_idx = 0; quote_idx < quotes.Count; quote_idx++)
    {
        ICoreMap quote = quotes[quote_idx];
        // Only fill in quotes that no earlier sieve has already attributed.
        if (quote.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation)) == null)
        {
            Pair<int, int> quoteRun = new Pair<int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
            // Gather candidate mentions in token windows before and after the quote.
            IList<Sieve.MentionData> closestMentionsBackward = FindClosestMentionsInSpanBackward(new Pair<int, int>(Math.Max(0, quoteRun.first - BackwardWindow), quoteRun.first - 1));
            IList<Sieve.MentionData> closestMentions = FindClosestMentionsInSpanForward(new Pair<int, int>(quoteRun.second + 1, Math.Min(quoteRun.second + ForwardWindow, toks.Count - 1)));
            Sharpen.Collections.AddAll(closestMentions, closestMentionsBackward);
            Person.Gender gender = GetGender(MakeMentionData(quote));
            IList<string> topSpeakers = Counters.ToSortedList(GetTopSpeakers(closestMentions, closestMentionsBackward, gender, quote, false));
            //if none found, try again with bigger window
            if (topSpeakers.IsEmpty())
            {
                closestMentionsBackward = FindClosestMentionsInSpanBackward(new Pair<int, int>(Math.Max(0, quoteRun.first - BackwardWindowBig), quoteRun.first - 1));
                closestMentions = FindClosestMentionsInSpanForward(new Pair<int, int>(quoteRun.second + 1, Math.Min(quoteRun.second + ForwardWindowBig, toks.Count - 1)));
                topSpeakers = Counters.ToSortedList(GetTopSpeakers(closestMentions, closestMentionsBackward, gender, quote, true));
            }
            if (topSpeakers.IsEmpty())
            {
                log.Warn("Watch out, there's an empty top speakers list!");
                continue;
            }
            topSpeakers = RemoveQuoteNames(topSpeakers, quote);
            string topSpeaker = topSpeakers[0];
            // Try conversational patterns (next / previous / family-animate vocative) in priority
            // order; the first one that sets a prediction wins.
            Pair<string, string> nextPrediction = GetConversationalNextPrediction(quotes, quote_idx, gender);
            bool set = UpdatePredictions(quote, nextPrediction);
            if (set)
            {
                continue;
            }
            Pair<string, string> prevPrediction = GetConversationalPreviousPrediction(quotes, quote_idx, gender);
            set = UpdatePredictions(quote, prevPrediction);
            if (set)
            {
                continue;
            }
            Pair<string, string> famPrediction = GetFamilyAnimateVocative(quotes, quote_idx, gender, topSpeakers);
            set = UpdatePredictions(quote, famPrediction);
            if (set)
            {
                continue;
            }
            // Fall back to the highest-ranked nearby speaker.
            UpdatePredictions(quote, new Pair<string, string>(topSpeaker, string.Empty));
        }
    }
}
/// <summary>Try to set up the NER tagger.</summary>
/// <remarks>
/// Loads the NER combiner class reflectively so this class has no hard dependency on it;
/// on any failure it logs a warning and leaves NER tagging disabled.
/// </remarks>
private static void SetupNERTagger()
{
    Type NerTaggerClass;
    try
    {
        NerTaggerClass = Sharpen.Runtime.GetType(NerCombinerName);
    }
    catch (Exception)
    {
        log.Warn(NerCombinerName + " not found - not applying NER tags!");
        return;
    }
    try
    {
        MethodInfo createMethod = Sharpen.Runtime.GetDeclaredMethod(NerTaggerClass, "createNERClassifierCombiner", typeof(string), typeof(Properties));
        // NOTE(review): invokes the static factory with a null name and an empty Properties --
        // confirm this matches the createNERClassifierCombiner(String, Properties) contract
        // and Sharpen's Invoke(target, args...) calling convention.
        NerTagger = createMethod.Invoke(null, null, new Properties());
        NerClassifyMethod = Sharpen.Runtime.GetDeclaredMethod(NerTaggerClass, "classify", typeof(IList));
    }
    catch (Exception)
    {
        log.Warn("Error setting up " + NerCombinerName + "! Not applying NER tags!");
    }
}
/// <summary>Add a word to the lexicon, unless it contains some non-Chinese character.</summary>
/// <param name="str">Candidate lexicon entry; blank strings, strings containing spaces, and excluded characters are rejected.</param>
private void AddStringToLexicon(string str)
{
    // Guard clauses replace the original nested if/else chain; behavior is identical.
    if (str.Equals(string.Empty))
    {
        logger.Warn("WARNING: blank line in lexicon");
        return;
    }
    if (str.Contains(" "))
    {
        logger.Warn("WARNING: word with space in lexicon");
        return;
    }
    if (ExcludeChar(str))
    {
        PrintlnErr("skipping word: " + str);
        return;
    }
    // printlnErr("adding word: "+str);
    words.Add(str);
}
/// <summary>Repeatedly applies a composite (higher-order) extraction rule over the merged token/expression list until nothing new is extracted or the iteration limit is reached.</summary>
/// <param name="compositeExtractRule">The higher-order rule to apply on each pass.</param>
/// <param name="merged">The current merged list of core maps (tokens plus already-collapsed expressions).</param>
/// <param name="matchedExpressions">Expressions matched so far; newly matched expressions are favored over these.</param>
/// <param name="limit">Maximum number of passes; a value &lt;= 0 means no limit.</param>
/// <returns>The final merged list paired with the de-duplicated matched expressions.</returns>
private Pair<IList<ICoreMap>, IList<T>> ApplyCompositeRule<_T0>(SequenceMatchRules.IExtractRule<IList<ICoreMap>, T> compositeExtractRule, IList<_T0> merged, IList<T> matchedExpressions, int limit)
    where _T0 : ICoreMap
{
    // Apply higher order rules
    bool done = false;
    // Limit of number of times rules are applied just in case
    int maxIters = limit;
    int iters = 0;
    while (!done)
    {
        IList<T> newExprs = new List<T>();
        bool extracted = compositeExtractRule.Extract(merged, newExprs);
        if (verbose && extracted)
        {
            log.Info("applyCompositeRule() extracting with " + compositeExtractRule + " from " + merged + " gives " + newExprs);
        }
        if (extracted)
        {
            AnnotateExpressions(merged, newExprs);
            newExprs = MatchedExpression.RemoveNullValues(newExprs);
            if (!newExprs.IsEmpty())
            {
                // Collapse the new matches into the merged list and fold them into the running result.
                newExprs = MatchedExpression.RemoveNested(newExprs);
                newExprs = MatchedExpression.RemoveOverlapping(newExprs);
                merged = MatchedExpression.ReplaceMerged(merged, newExprs);
                // Favor newly matched expressions over older ones
                Sharpen.Collections.AddAll(newExprs, matchedExpressions);
                matchedExpressions = MatchedExpression.RemoveNested(newExprs);
                matchedExpressions = MatchedExpression.RemoveOverlapping(matchedExpressions);
            }
            else
            {
                // Everything extracted had null values; treat this pass as a no-op so the loop ends.
                extracted = false;
            }
        }
        done = !extracted;
        iters++;
        if (maxIters > 0 && iters >= maxIters)
        {
            if (verbose)
            {
                log.Warn("Aborting application of composite rules: Maximum iteration " + maxIters + " reached");
            }
            break;
        }
    }
    return(new Pair<IList<ICoreMap>, IList<T>>(merged, matchedExpressions));
}
/// <summary>Segment input and write to output stream.</summary>
/// <param name="segmenter">The Arabic segmenter to apply.</param>
/// <param name="br">Input reader; each line is segmented independently.</param>
/// <param name="pwOut">Output writer receiving one segmented line per input line.</param>
/// <param name="nThreads">Number of worker threads; must be positive.</param>
/// <returns>input characters processed per second</returns>
private static double Decode(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter, BufferedReader br, PrintWriter pwOut, int nThreads)
{
    System.Diagnostics.Debug.Assert(nThreads > 0);
    long nChars = 0;
    long startTime = Runtime.NanoTime();
    if (nThreads > 1)
    {
        // Multithreaded path: feed lines to the wrapper, draining completed results as we go
        // so output stays in input order without unbounded buffering.
        MulticoreWrapper<string, string> wrapper = new MulticoreWrapper<string, string>(nThreads, segmenter);
        try
        {
            for (string line; (line = br.ReadLine()) != null;)
            {
                nChars += line.Length;
                wrapper.Put(line);
                while (wrapper.Peek())
                {
                    pwOut.Println(wrapper.Poll());
                }
            }
            // All input submitted; wait for the workers and flush any remaining results.
            wrapper.Join();
            while (wrapper.Peek())
            {
                pwOut.Println(wrapper.Poll());
            }
        }
        catch (IOException e)
        {
            log.Warn(e);
        }
    }
    else
    {
        // Single-threaded path: the segmenter reads the stream and counts characters itself.
        nChars = segmenter.Segment(br, pwOut);
    }
    long duration = Runtime.NanoTime() - startTime;
    double charsPerSec = (double)nChars / (duration / 1000000000.0);
    return(charsPerSec);
}
/// <summary>
/// Read a gazette mapping in TokensRegex format from the given path
/// The format is: 'case_sensitive_word \t target_ner_class' (additional info is ignored).
/// </summary>
/// <param name="mappingFile">The mapping file to read from, as a path either on the filesystem or in your classpath.</param>
/// <returns>The mapping from word to NER tag.</returns>
private static IDictionary<string, string> ReadRegexnerGazette(string mappingFile)
{
    IDictionary<string, string> mapping = new Dictionary<string, string>();
    try
    {
        using (BufferedReader reader = IOUtils.ReaderFromString(mappingFile.Trim()))
        {
            foreach (string line in IOUtils.SlurpReader(reader).Split("\n"))
            {
                string[] fields = line.Split("\t");
                // ROBUSTNESS FIX: skip blank or malformed lines (e.g. the empty final line that a
                // trailing newline produces). The original indexed fields[1] unconditionally, which
                // threw IndexOutOfRangeException -- and only IOException is caught below.
                if (fields.Length < 2)
                {
                    continue;
                }
                string key = fields[0];
                string target = fields[1];
                mapping[key] = target;
            }
        }
    }
    catch (IOException)
    {
        log.Warn("Could not read Regex mapping: " + mappingFile);
    }
    return(Java.Util.Collections.UnmodifiableMap(mapping));
}
/// <summary>Annotates the document with time expressions (and numerized tokens), sentence by sentence.</summary>
/// <param name="annotation">The document to annotate; its DocDateAnnotation (or, failing that, CalendarAnnotation) anchors relative time expressions.</param>
public virtual void Annotate(Annotation annotation)
{
    SUTime.TimeIndex timeIndex = new SUTime.TimeIndex();
    string docDate = annotation.Get(typeof(CoreAnnotations.DocDateAnnotation));
    if (docDate == null)
    {
        // No explicit document date: fall back to the document's calendar annotation.
        Calendar cal = annotation.Get(typeof(CoreAnnotations.CalendarAnnotation));
        if (cal == null)
        {
            if (!quiet)
            {
                log.Warn("No document date specified");
            }
        }
        else
        {
            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd:hh:mm:ss");
            docDate = dateFormat.Format(cal.GetTime());
        }
    }
    IList<ICoreMap> allTimeExpressions;
    // initialized below = null;
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    if (sentences != null)
    {
        allTimeExpressions = new List<ICoreMap>();
        IList<ICoreMap> allNumerics = new List<ICoreMap>();
        foreach (ICoreMap sentence in sentences)
        {
            // make sure that token character offsets align with the actual sentence text
            // They may not align due to token normalizations, such as "(" to "-LRB-".
            ICoreMap alignedSentence = NumberSequenceClassifier.AlignSentence(sentence);
            // uncomment the next line for verbose dumping of tokens....
            // log.info("SENTENCE: " + ((ArrayCoreMap) sentence).toShorterString());
            IList<ICoreMap> timeExpressions = timexExtractor.ExtractTimeExpressionCoreMaps(alignedSentence, docDate, timeIndex);
            if (timeExpressions != null)
            {
                Sharpen.Collections.AddAll(allTimeExpressions, timeExpressions);
                sentence.Set(typeof(TimeAnnotations.TimexAnnotations), timeExpressions);
                // Record which sentence each time expression came from.
                foreach (ICoreMap timeExpression in timeExpressions)
                {
                    timeExpression.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
                }
            }
            IList<ICoreMap> numbers = alignedSentence.Get(typeof(CoreAnnotations.NumerizedTokensAnnotation));
            if (numbers != null)
            {
                sentence.Set(typeof(CoreAnnotations.NumerizedTokensAnnotation), numbers);
                Sharpen.Collections.AddAll(allNumerics, numbers);
            }
        }
        annotation.Set(typeof(CoreAnnotations.NumerizedTokensAnnotation), allNumerics);
    }
    else
    {
        // No sentence splitting was done: annotate the whole document as one unit.
        allTimeExpressions = AnnotateSingleSentence(annotation, docDate, timeIndex);
    }
    annotation.Set(typeof(TimeAnnotations.TimexAnnotations), allTimeExpressions);
}
/// <summary>
/// reads in the features from a file, having already read the
/// experiments
/// </summary>
/// <param name="filename">Path of the data file containing a &lt;features&gt; section with an &lt;fSize&gt;N&lt;/fSize&gt; header.</param>
/// <param name="domain">The already-read experiments, used to index (x, y) instance pairs.</param>
public Features(string filename, Experiments domain)
{
    Exception e1 = new Exception("Incorrect data file format!");
    IIndex<IntPair> instanceIndex = domain.CreateIndex();
    try
    {
        using (BufferedReader @in = new BufferedReader(new FileReader(filename)))
        {
            string s;
            // BUGFIX: the original looped "while (true) { s = ReadLine(); if (s.Equals(...)) ... }",
            // so hitting EOF without a "<features>" marker threw NullReferenceException on s.Equals
            // and the subsequent null check was dead code. Check for EOF in the loop condition.
            while ((s = @in.ReadLine()) != null)
            {
                if (s.Equals("<features>"))
                {
                    break;
                }
            }
            if (s == null)
            {
                throw e1;
            }
            // Expect a single "<fSize>N</fSize>" line giving the number of features.
            s = @in.ReadLine();
            if (!s.StartsWith("<fSize>"))
            {
                throw e1;
            }
            if (!s.EndsWith("</fSize>"))
            {
                throw e1;
            }
            int index1 = s.IndexOf(">");
            int index2 = s.LastIndexOf("<");
            string fSt = Sharpen.Runtime.Substring(s, index1 + 1, index2);
            System.Console.Out.WriteLine(fSt);
            int number = System.Convert.ToInt32(fSt);
            System.Console.Out.WriteLine("fSize is " + number);
            // Scratch arrays, sized by the class-level maxValue bound.
            int[] arrIndexes = new int[maxValue];
            double[] arrValues = new double[maxValue];
            // for f
            for (int f = 0; f < number; f++)
            {
                // Each feature line is a sequence of space-separated "x y value" triples.
                string line = @in.ReadLine();
                int indSp = -1;
                int current = 0;
                while ((indSp = line.IndexOf(" ")) > -1)
                {
                    int x = System.Convert.ToInt32(Sharpen.Runtime.Substring(line, 0, indSp));
                    line = Sharpen.Runtime.Substring(line, indSp + 1);
                    indSp = line.IndexOf(" ");
                    if (indSp == -1)
                    {
                        indSp = line.Length;
                    }
                    int y = System.Convert.ToInt32(Sharpen.Runtime.Substring(line, 0, indSp));
                    line = Sharpen.Runtime.Substring(line, indSp + 1);
                    indSp = line.IndexOf(" ");
                    if (indSp == -1)
                    {
                        indSp = line.Length;
                    }
                    double val = double.ParseDouble(Sharpen.Runtime.Substring(line, 0, indSp));
                    if (indSp < line.Length)
                    {
                        line = Sharpen.Runtime.Substring(line, indSp + 1);
                    }
                    arrIndexes[current] = instanceIndex.IndexOf(new IntPair(x, y));
                    arrValues[current] = val;
                    current++;
                }
                // Copy the filled prefix of the scratch arrays into right-sized arrays.
                int[] indValues = new int[current];
                double[] values = new double[current];
                for (int j = 0; j < current; j++)
                {
                    indValues[j] = arrIndexes[j];
                    values[j] = arrValues[j];
                }
                Feature bf = new Feature(domain, indValues, values, instanceIndex);
                this.Add(bf);
            }
        }
    }
    catch (Exception e)
    {
        // Format errors (including the deliberate e1 throws) are logged, not propagated.
        log.Warn(e);
    }
}
/// <summary>Extracts time expression core maps for one sentence, resolving the reference date from the section, document, or calendar annotations.</summary>
/// <param name="annotation">The sentence to process.</param>
/// <param name="docAnnotation">The enclosing document (may be null); supplies the shared TimeIndex and document date.</param>
/// <returns>The extracted time expression core maps.</returns>
public virtual IList<ICoreMap> ExtractTimeExpressionCoreMaps(ICoreMap annotation, ICoreMap docAnnotation)
{
    SUTime.TimeIndex timeIndex;
    // initialized immediately below
    string docDate = null;
    if (docAnnotation != null)
    {
        // Reuse (or lazily create) the document-level time index so timex ids stay consistent
        // across sentences of the same document.
        timeIndex = docAnnotation.Get(typeof(TimeExpression.TimeIndexAnnotation));
        if (timeIndex == null)
        {
            docAnnotation.Set(typeof(TimeExpression.TimeIndexAnnotation), timeIndex = new SUTime.TimeIndex());
        }
        // default look for the sentence's forum post date
        // if it doesn't have one, back off to the document date
        if (annotation.Get(typeof(CoreAnnotations.SectionDateAnnotation)) != null)
        {
            docDate = annotation.Get(typeof(CoreAnnotations.SectionDateAnnotation));
        }
        else
        {
            docDate = docAnnotation.Get(typeof(CoreAnnotations.DocDateAnnotation));
        }
        if (docDate == null)
        {
            // Last resort: derive the date from the document's calendar annotation.
            Calendar cal = docAnnotation.Get(typeof(CoreAnnotations.CalendarAnnotation));
            if (cal == null)
            {
                if (options.verbose)
                {
                    logger.Warn("WARNING: No document date specified");
                }
            }
            else
            {
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd:hh:mm:ss");
                docDate = dateFormat.Format(cal.GetTime());
            }
        }
    }
    else
    {
        // No document context: use a fresh, local time index.
        timeIndex = new SUTime.TimeIndex();
    }
    if (StringUtils.IsNullOrEmpty(docDate))
    {
        docDate = null;
    }
    if (timeIndex.docDate == null && docDate != null)
    {
        try
        {
            // TODO: have more robust parsing of document date? docDate may not have century....
            // TODO: if docDate didn't change, we can cache the parsing of the docDate and not repeat it for every sentence
            timeIndex.docDate = SUTime.ParseDateTime(docDate, true);
        }
        catch (Exception e)
        {
            throw new Exception("Could not parse date string: [" + docDate + "]", e);
        }
    }
    // The section date, when present, takes priority as the reference date.
    string sectionDate = annotation.Get(typeof(CoreAnnotations.SectionDateAnnotation));
    string refDate = (sectionDate != null) ? sectionDate : docDate;
    return(ExtractTimeExpressionCoreMaps(annotation, refDate, timeIndex));
}
/// <summary>Runs the Chinese segmenter over the annotation's text and builds the TokensAnnotation, re-aligning segmenter output with the pre-split character list to recover character offsets and pass XML tag runs through as single tokens.</summary>
/// <param name="annotation">The sentence/document carrying TextAnnotation and CharactersAnnotation; receives TokensAnnotation.</param>
private void RunSegmentation(ICoreMap annotation)
{
    //0 2
    // A BC D E
    // 1 10 1 1
    // 0 12 3 4
    // 0, 0+1 ,
    string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    // the original text String
    IList<CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));
    // the way it was divided by splitCharacters
    if (Verbose)
    {
        log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
    }
    IList<CoreLabel> tokens = new List<CoreLabel>();
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    // Run the segmenter! On the whole String. It knows not about the splitting into chars.
    // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
    IList<string> words;
    if (!tokenizeNewline)
    {
        text = text.ReplaceAll("[\r\n]", string.Empty);
        words = segmenter.SegmentString(text);
    }
    else
    {
        // remove leading and trailing newlines
        text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
        text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
        // if using the sentence split on two newlines option, replace single newlines
        // single newlines should be ignored for segmenting
        if (sentenceSplitOnTwoNewlines)
        {
            text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
            // do a second pass to handle corner case of consecutive isolated newlines
            // x \n x \n x
            text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
        }
        // Run the segmenter on each line so that we don't get tokens that cross line boundaries
        // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
        string[] lines = text.Split(string.Format("((?<=%1$s)|(?=%1$s))", separator));
        words = new List<string>();
        foreach (string line in lines)
        {
            if (separatorPattern.Matcher(line).Matches())
            {
                // Don't segment newline tokens, keep them as-is
                words.Add(line);
            }
            else
            {
                Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
            }
        }
    }
    if (Verbose)
    {
        log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
    }
    // Go through everything again and make the final tokens list; for loop is over segmented words
    int pos = 0;
    // This is used to index sentChars, the output from splitCharacters
    StringBuilder xmlBuffer = new StringBuilder();
    int xmlBegin = -1;
    foreach (string w in words)
    {
        CoreLabel fl = sentChars[pos];
        string xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
        if (Verbose)
        {
            log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
        }
        if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
        {
            // Beginnings of plain text and other XML tags are good places to end an XML tag
            if (xmlBuffer.Length > 0)
            {
                // Form the XML token
                string xmlTag = xmlBuffer.ToString();
                CoreLabel fl1 = sentChars[pos - 1];
                int end = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                // Clean up and prepare for the next XML tag
                xmlBegin = -1;
                xmlBuffer = new StringBuilder();
            }
        }
        if (!"0".Equals(xmlCharAnnotation))
        {
            // found an XML character; fl changes inside this loop!
            while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
            {
                // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                // and we're in sync with segmenter output again
                xmlBuffer.Append(' ');
                pos += 1;
                fl = sentChars[pos];
            }
            xmlBuffer.Append(w);
            pos = AdvancePos(sentChars, pos, w);
            if (xmlBegin < 0)
            {
                xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            }
            continue;
        }
        // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
        fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
        if (w.IsEmpty())
        {
            // [cdm 2016:] surely this shouldn't happen!
            if (Verbose)
            {
                log.Warn("Encountered an empty word. Shouldn't happen?");
            }
            continue;
        }
        int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        pos = AdvancePos(sentChars, pos, w);
        if (pos - 1 >= sentChars.Count)
        {
            // Alignment between segmenter output and sentChars drifted off the end: log and skip.
            log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
            log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
        }
        else
        {
            fl = sentChars[pos - 1];
            int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            tokens.Add(MakeXmlToken(w, false, begin, end));
        }
    }
    // end for (go through everything again)
    if (xmlBuffer.Length > 0)
    {
        // Form the last XML token, if any
        string xmlTag = xmlBuffer.ToString();
        CoreLabel fl1 = sentChars[pos - 1];
        int end = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
    }
    if (Verbose)
    {
        foreach (CoreLabel token in tokens)
        {
            log.Info(token.ToShorterString());
        }
    }
}
// 1. Create the input
// 1.1 Create a protocol buffer
// 1.2 Create the query params
// 2. Create a connection
// 3. Do the annotation
// This method has two contracts:
// 1. It should call the two relevant callbacks
// 2. It must not throw an exception
/// <summary>Actually try to perform the annotation on the server side.</summary>
/// <remarks>
/// Actually try to perform the annotation on the server side.
/// This is factored out so that we can retry up to 3 times.
/// </remarks>
/// <param name="annotation">The annotation we need to fill.</param>
/// <param name="backend">The backend we are querying against.</param>
/// <param name="serverURL">The URL of the server we are hitting.</param>
/// <param name="message">The message we are sending the server (don't need to recompute each retry).</param>
/// <param name="tries">The number of times we've tried already.</param>
private void DoAnnotation(Annotation annotation, StanfordCoreNLPClient.Backend backend, URL serverURL, byte[] message, int tries)
{
    try
    {
        // 1. Set up the connection
        URLConnection connection = serverURL.OpenConnection();
        // 1.1 Set authentication
        if (apiKey != null && apiSecret != null)
        {
            string userpass = apiKey + ":" + apiSecret;
            string basicAuth = "Basic " + Sharpen.Runtime.GetStringForBytes(Base64.GetEncoder().Encode(Sharpen.Runtime.GetBytesForString(userpass)));
            connection.SetRequestProperty("Authorization", basicAuth);
        }
        // 1.2 Set some protocol-independent properties
        connection.SetDoOutput(true);
        connection.SetRequestProperty("Content-Type", "application/x-protobuf");
        connection.SetRequestProperty("Content-Length", int.ToString(message.Length));
        connection.SetRequestProperty("Accept-Charset", "utf-8");
        connection.SetRequestProperty("User-Agent", typeof(StanfordCoreNLPClient).FullName);
        switch (backend.protocol)
        {
            case "https":
            case "http":
            {
                // 1.3 Set some protocol-dependent properties
                ((HttpURLConnection)connection).SetRequestMethod("POST");
                break;
            }

            default:
            {
                throw new InvalidOperationException("Haven't implemented protocol: " + backend.protocol);
            }
        }
        // 2. Annotate
        // 2.1. Fire off the request
        connection.Connect();
        connection.GetOutputStream().Write(message);
        connection.GetOutputStream().Flush();
        // 2.2 Await a response
        // -- It might be possible to send more than one message, but we are not going to do that.
        Annotation response = serializer.Read(connection.GetInputStream()).first;
        // 2.3. Copy response over to original annotation
        foreach (Type key in response.KeySet())
        {
            annotation.Set(key, response.Get(key));
        }
    }
    catch (Exception t)
    {
        // 3. We encountered an error -- retry
        if (tries < 3)
        {
            log.Warn(t);
            DoAnnotation(annotation, backend, serverURL, message, tries + 1);
        }
        else
        {
            throw new Exception(t);
        }
    }
}
/// <summary>Normalizes a string character-by-character: ASCII/fullwidth conversion, space-character handling, and middle-dot handling, each controlled by a mode constant.</summary>
/// <param name="in">The input string; expected to be BMP-only (surrogates trigger a warning but are passed through).</param>
/// <param name="ascii">One of Leave/Ascii/Fullwidth: direction of ASCII-fullwidth conversion.</param>
/// <param name="spaceChar">One of Leave/Ascii/Fullwidth/Delete/DeleteExceptBetweenAscii: space-character handling.</param>
/// <param name="midDot">One of Leave/Normalize/Fullwidth/Delete: middle-dot handling.</param>
/// <returns>The normalized string.</returns>
private static string NormalizeBMP(string @in, int ascii, int spaceChar, int midDot)
{
    StringBuilder @out = new StringBuilder();
    int len = @in.Length;
    for (int i = 0; i < len; i++)
    {
        char cp = @in[i];
        // This method works char-by-char, so warn about non-BMP input it cannot handle correctly.
        if (char.IsHighSurrogate(cp))
        {
            if (i + 1 < len)
            {
                log.Warn("ChineseUtils.normalize warning: non-BMP codepoint U+" + int.ToHexString(char.CodePointAt(@in, i)) + " in " + @in);
            }
            else
            {
                log.Warn("ChineseUtils.normalize warning: unmatched high surrogate character U+" + int.ToHexString(char.CodePointAt(@in, i)) + " in " + @in);
            }
        }
        Character.UnicodeBlock cub = Character.UnicodeBlock.Of(cp);
        if (cub == Character.UnicodeBlock.PrivateUseArea || cub == Character.UnicodeBlock.SupplementaryPrivateUseAreaA || cub == Character.UnicodeBlock.SupplementaryPrivateUseAreaB)
        {
            EncodingPrintWriter.Err.Println("ChineseUtils.normalize warning: private use area codepoint U+" + int.ToHexString(cp) + " in " + @in);
        }
        bool delete = false;
        switch (ascii)
        {
            case Leave:
            {
                break;
            }

            case Ascii:
            {
                // Map fullwidth forms (U+FF01..U+FF5E) down to ASCII (U+0021..U+007E).
                if (cp >= '\uFF01' && cp <= '\uFF5E')
                {
                    cp -= (char)(unchecked ((int)(0xFF00)) - unchecked ((int)(0x0020)));
                }
                break;
            }

            case Fullwidth:
            {
                // Map ASCII (U+0021..U+007E) up to the corresponding fullwidth forms.
                if (cp >= '\u0021' && cp <= '\u007E')
                {
                    cp += (char)(unchecked ((int)(0xFF00)) - unchecked ((int)(0x0020)));
                }
                break;
            }

            default:
            {
                throw new ArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
            }
        }
        // NOTE(review): unlike the other two switches, this one has no default throw,
        // so an unrecognized spaceChar option is silently treated as Leave -- confirm intended.
        switch (spaceChar)
        {
            case Leave:
            {
                break;
            }

            case Ascii:
            {
                if (char.IsSpaceChar(cp))
                {
                    cp = ' ';
                }
                break;
            }

            case Fullwidth:
            {
                // U+3000 is the ideographic (fullwidth) space.
                if (char.IsSpaceChar(cp))
                {
                    cp = '\u3000';
                }
                break;
            }

            case Delete:
            {
                if (char.IsSpaceChar(cp))
                {
                    delete = true;
                }
                break;
            }

            case DeleteExceptBetweenAscii:
            {
                // Only delete a space when it is NOT sandwiched between two ASCII-range characters.
                char cpp = 0;
                if (i > 0)
                {
                    cpp = @in[i - 1];
                }
                char cpn = 0;
                if (i < (len - 1))
                {
                    cpn = @in[i + 1];
                }
                // EncodingPrintWriter.out.println("cp: " + cp + "; cpp: " + cpp + "cpn: " + cpn +
                // "; isSpace: " + Character.isSpaceChar(cp) + "; isAsciiLHL: " + isAsciiLowHigh(cpp) +
                // "; isAsciiLHR: " + isAsciiLowHigh(cpn), "UTF-8");
                if (char.IsSpaceChar(cp) && !(IsAsciiLowHigh(cpp) && IsAsciiLowHigh(cpn)))
                {
                    delete = true;
                }
                break;
            }
        }
        switch (midDot)
        {
            case Leave:
            {
                break;
            }

            case Normalize:
            {
                // Canonicalize any middle-dot variant to U+00B7 MIDDLE DOT.
                if (IsMidDot(cp))
                {
                    cp = '\u00B7';
                }
                break;
            }

            case Fullwidth:
            {
                // Use U+30FB KATAKANA MIDDLE DOT instead.
                if (IsMidDot(cp))
                {
                    cp = '\u30FB';
                }
                break;
            }

            case Delete:
            {
                if (IsMidDot(cp))
                {
                    delete = true;
                }
                break;
            }

            default:
            {
                throw new ArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
            }
        }
        if (!delete)
        {
            @out.Append(cp);
        }
    }
    // end for
    return(@out.ToString());
}