/// <summary>
/// Tokenizes the given line and builds an un-parsed <see cref="Parse"/> over it,
/// then asks the parser for the top <paramref name="numParses"/> parses.
/// </summary>
/// <param name="line">The sentence to parse.</param>
/// <param name="parser">The parser implementation to use.</param>
/// <param name="numParses">The number of parses to return.</param>
/// <returns>An array with the requested parses.</returns>
public static Parse[] ParseLine(string line, IParser parser, int numParses) {
    // Separate tokens that are glued to parentheses before tokenizing.
    line = untokenizedParentPattern1.Replace(line, "$1 $2");
    line = untokenizedParentPattern2.Replace(line, "$1 $2");

    var words = new List<string>();
    var buffer = new StringBuilder();
    var tokenizer = new StringTokenizer(line);
    while (tokenizer.HasMoreTokens) {
        var word = tokenizer.NextToken;
        words.Add(word);
        buffer.Append(word).Append(" ");
    }

    // Drop the trailing separator appended by the loop above.
    var text = buffer.ToString(0, buffer.Length - 1);

    var root = new Parse(text, new Span(0, text.Length), AbstractBottomUpParser.INC_NODE, 0, 0);

    var offset = 0;
    for (var index = 0; index < words.Count; index++) {
        var word = words[index];
        // One token node per word; offsets account for the single space separator.
        root.Insert(new Parse(text, new Span(offset, offset + word.Length), AbstractBottomUpParser.TOK_NODE, 0, index));
        offset += word.Length + 1;
    }

    if (numParses == 1) {
        return new[] { parser.Parse(root) };
    }
    return parser.Parse(root, numParses);
}
/// <summary>
/// Gets the outcome patterns.
/// </summary>
/// <returns>System.Int32[][].</returns>
protected int[][] GetOutcomePatterns() {
    // First value in the stream is the number of pattern rows that follow.
    var patternCount = ReadInt();
    var patterns = new int[patternCount][];
    for (var row = 0; row < patternCount; row++) {
        // Each row is a space-separated list of integers.
        var tokenizer = new StringTokenizer(ReadString(), " ");
        var values = new int[tokenizer.CountTokens];
        var col = 0;
        while (tokenizer.HasMoreTokens) {
            values[col++] = int.Parse(tokenizer.NextToken);
        }
        patterns[row] = values;
    }
    return patterns;
}
/// <summary>
/// Returns the next object. Calling this method repeatedly until it returns
/// null will return each object from the underlying source exactly once.
/// </summary>
/// <returns>
/// The next object or null to signal that the stream is exhausted.
/// </returns>
public virtual Event Read() {
    var line = Reader.ReadLine();
    if (line == null) {
        // Underlying reader is exhausted.
        return null;
    }
    var tokenizer = new StringTokenizer(line);
    // First token is the outcome; the remaining tokens form the context.
    var outcome = tokenizer.NextToken;
    var context = new string[tokenizer.CountTokens];
    var position = 0;
    while (position < context.Length) {
        context[position++] = tokenizer.NextToken;
    }
    return new Event(outcome, context);
}
/// <summary>
/// Tokenizes the input line, wraps it in an incomplete <see cref="Parse"/>
/// node with one token child per word, and returns <paramref name="numParses"/>
/// parses produced by <paramref name="parser"/>.
/// </summary>
/// <param name="line">The sentence to parse.</param>
/// <param name="parser">The parser implementation to use.</param>
/// <param name="numParses">The number of parses to return.</param>
/// <returns>An array with the requested parses.</returns>
public static Parse[] ParseLine(string line, IParser parser, int numParses) {
    // Split tokens glued to parentheses so the tokenizer sees them separately.
    line = untokenizedParentPattern1.Replace(line, "$1 $2");
    line = untokenizedParentPattern2.Replace(line, "$1 $2");

    var tokenizer = new StringTokenizer(line);
    var tokenList = new List<string>();
    var textBuilder = new StringBuilder();
    while (tokenizer.HasMoreTokens) {
        var token = tokenizer.NextToken;
        tokenList.Add(token);
        textBuilder.Append(token).Append(" ");
    }

    // Remove the final separator so the text ends on the last token.
    var text = textBuilder.ToString(0, textBuilder.Length - 1);
    var parse = new Parse(text, new Span(0, text.Length), AbstractBottomUpParser.INC_NODE, 0, 0);

    var position = 0;
    var tokenIndex = 0;
    foreach (var token in tokenList) {
        // Each child span covers one token; +1 skips the space between tokens.
        parse.Insert(new Parse(text, new Span(position, position + token.Length), AbstractBottomUpParser.TOK_NODE, 0, tokenIndex));
        position += token.Length + 1;
        tokenIndex++;
    }

    return numParses == 1 ? new[] { parser.Parse(parse) } : parser.Parse(parse, numParses);
}
/// <summary>
/// Converts a single AD-corpus leaf into aligned entries appended to the four
/// parallel output lists: the surface token(s), their tags, contraction markers
/// ("B"/"E"/null) and proper-noun markers ("P"/null). Multi-word lexemes
/// (underscore-joined) and hyphenated lexemes may contribute several entries;
/// every branch appends the same number of items to all four lists.
/// </summary>
/// <param name="leaf">The leaf to process; ignored when null.</param>
/// <param name="sentence">Receives the surface token(s).</param>
/// <param name="tags">Receives the tag for each emitted token.</param>
/// <param name="con">Receives the contraction marker (or null) per token.</param>
/// <param name="prop">Receives the proper-noun marker (or null) per token.</param>
private void ProcessLeaf(AdLeaf leaf, List <string> sentence, List <string> tags, List <string> con, List <string> prop) {
    if (leaf == null) {
        return;
    }
    var lexeme = leaf.Lexeme;
    // this will change half of the quotation marks
    // NOTE(review): only every other guillemet (even callsCount) is rewritten to a
    // plain double quote; the odd ones pass through unchanged — presumably
    // intentional to mimic opening/closing quote alternation, confirm upstream.
    if ("«" == lexeme || "»" == lexeme) {
        if (callsCount % 2 == 0) {
            lexeme = "\"";
        }
    }
    var tag = leaf.FunctionalTag;
    // Contraction marker: "B" when this leaf begins a contraction, "E" when it ends one.
    string contraction = null;
    if (leaf.SecondaryTag != null) {
        if (leaf.SecondaryTag.Contains("<sam->")) {
            contraction = "B";
        } else if (leaf.SecondaryTag.Contains("<-sam>")) {
            contraction = "E";
        }
    }
    // Leaves without a functional tag fall back to using the lexeme itself as the tag.
    if (tag == null) {
        tag = lexeme;
    }
    if (includeFeatures && !string.IsNullOrEmpty(leaf.MorphologicalTag)) {
        tag += " " + leaf.MorphologicalTag;
    }
    // Collapse whitespace inside the tag to '='; falls back to the lexeme if the
    // replacement yields null.
    tag = tag.RegExReplace(Expressions.Expression.Space, "=") ?? lexeme;
    //tag = tag.replaceAll("\\s+", "=");
    if (expandMe && lexeme.Contains("_")) {
        // Multi-word expression: expand "a_b_c" into separate tokens.
        var tokenizer = new StringTokenizer(lexeme, "_");
        if (tag == "prop") {
            // Proper nouns are kept as a single token and marked "P".
            sentence.Add(lexeme);
            tags.Add(tag);
            con.Add(null);
            prop.Add("P");
        } else if (tokenizer.CountTokens > 0) {
            // Emit the parts with BIO-style tags: "B-<tag>" then "I-<tag>"...
            var toks = new List <string>(tokenizer.CountTokens);
            var tagsWithCont = new List <string>(tokenizer.CountTokens);
            toks.Add(tokenizer.NextToken);
            tagsWithCont.Add("B-" + tag);
            while (tokenizer.HasMoreTokens) {
                toks.Add(tokenizer.NextToken);
                tagsWithCont.Add("I-" + tag);
            }
            if (contraction != null) {
                // Only the last expanded token carries the contraction marker.
                con.AddRange(new string[toks.Count - 1]);
                con.Add(contraction);
            } else {
                con.AddRange(new string[toks.Count]);
            }
            sentence.AddRange(toks);
            tags.AddRange(tagsWithCont);
            prop.AddRange(new string[toks.Count]);
        } else {
            sentence.Add(lexeme);
            tags.Add(tag);
            prop.Add(null);
            con.Add(contraction);
        }
    } else if (lexeme.Contains(hyphen) && lexeme.Length > 1) {
        // Hyphenated lexeme: split into (first, hyphen, second, rest) via hyphenRegex.
        string firstTok = null;
        string secondTok = null;
        string rest = null;
        var match = hyphenRegex.Match(lexeme);
        if (match.Success) {
            // The regex has three alternatives; which outer group matched decides
            // which capture groups hold the pieces.
            if (match.Groups[1].Success) {
                firstTok = match.Groups[2].Value;
            } else if (match.Groups[3].Success) {
                secondTok = match.Groups[4].Value;
                rest = match.Groups[5].Value;
            } else if (match.Groups[6].Success) {
                firstTok = match.Groups[7].Value;
                secondTok = match.Groups[8].Value;
                rest = match.Groups[9].Value;
            } else {
                throw new InvalidFormatException("Wrong hyphen pattern.");
            }
            // Emit each non-empty piece; the hyphen itself is tagged with the
            // hyphen string, the other pieces reuse the leaf's tag.
            if (!string.IsNullOrEmpty(firstTok)) {
                sentence.Add(firstTok);
                tags.Add(tag);
                prop.Add(null);
                con.Add(contraction);
            }
            if (!string.IsNullOrEmpty(hyphen)) {
                sentence.Add(hyphen);
                tags.Add(hyphen);
                prop.Add(null);
                con.Add(contraction);
            }
            if (!string.IsNullOrEmpty(secondTok)) {
                sentence.Add(secondTok);
                tags.Add(tag);
                prop.Add(null);
                con.Add(contraction);
            }
            if (!string.IsNullOrEmpty(rest)) {
                sentence.Add(rest);
                tags.Add(tag);
                prop.Add(null);
                con.Add(contraction);
            }
        } else {
            // Lexeme contains the hyphen but does not match the pattern: keep as-is.
            sentence.Add(lexeme);
            tags.Add(tag);
            prop.Add(null);
            con.Add(contraction);
        }
    } else {
        // Plain single token: optionally enrich the tag with gender information.
        tag = AddGender(tag, leaf.MorphologicalTag);
        sentence.Add(lexeme);
        tags.Add(tag);
        prop.Add(null);
        con.Add(contraction);
    }
}