/// <summary>
/// Checks whether a sentence contains the given phrasal verb by scanning its tokens:
/// first the root must be found, then every particle must appear, in order, among the
/// tokens that follow the root.
/// </summary>
/// <param name="sentence">The sentence to tokenize and scan.</param>
/// <param name="phrasalVerb">The phrasal verb whose <c>Root</c> and <c>Particles</c> are looked for.</param>
/// <returns>
/// <c>true</c> as soon as the last particle has been matched after the root;
/// <c>false</c> when the tokens are exhausted before that happens.
/// </returns>
public bool IsMatch(string sentence, PhrasalVerb phrasalVerb)
{
    var tokens = tokenizer.Tokenize(sentence);

    // Phase 1: advance until a token equals the root, either as written or once lemmatized.
    // The lemmatizer is only consulted when the raw token does not already match.
    var position = 0;
    var rootFound = false;
    while (!rootFound && position < tokens.Length)
    {
        var candidate = tokens[position];
        position++;

        rootFound = string.Equals(candidate, phrasalVerb.Root, StringComparison.InvariantCultureIgnoreCase)
                    || string.Equals(lemmatizer.Lemmatize(candidate), phrasalVerb.Root, StringComparison.InvariantCultureIgnoreCase);
    }

    if (!rootFound)
    {
        return false;
    }

    // Phase 2: consume the particles in order among the tokens located after the root.
    // Tokens that are not the expected particle are simply skipped.
    var matchedParticles = 0;
    for (; position < tokens.Length; position++)
    {
        if (phrasalVerb.Particles.Count <= matchedParticles)
        {
            // nothing left to look for on this token (e.g. no particles at all)
            continue;
        }

        var expectedParticle = phrasalVerb.Particles[matchedParticles];
        if (!string.Equals(tokens[position], expectedParticle, StringComparison.InvariantCultureIgnoreCase))
        {
            continue;
        }

        matchedParticles++;
        if (matchedParticles >= phrasalVerb.Particles.Count)
        {
            // every particle has been matched
            return true;
        }
    }

    // ran out of tokens before matching everything
    return false;
}
/// <summary>
/// Checks whether the tokenized sentence contains the given phrasal verb, using the
/// dependencies computed from the sentence's parse tree to link the root to its first particle.
/// </summary>
/// <param name="tokens">The tokens of the sentence, in sentence order.</param>
/// <param name="pv">
/// The phrasal verb to look for. Its <c>Name</c> is split on spaces: the first part is the root,
/// the following parts are the particles. <c>Inseparable</c> and <c>SeparableMandatory</c>
/// constrain the distance allowed between the two ends of a dependency.
/// </param>
/// <returns>
/// <c>true</c> when a dependency links the root to the first particle and, for phrasal verbs
/// with several particles, the remaining particles appear among the tokens taken from the end
/// of that dependency onwards; <c>false</c> otherwise, including when an exception is raised
/// while matching (the exception is logged to the console and swallowed).
/// </returns>
public bool IsMatch(List<string> tokens, PhrasalVerb pv)
{
    try
    {
        // create the parse tree and compute the dependencies between the words of the sentence
        var parse = parser.DoParse(tokens);
        var dependencies = ComputeDependencies(parse).ToList();

        var parts = pv.Name.Split(' ').ToList();
        var root = parts.First();

        // keep the dependencies in which the (lemmatized) root is one end and comes first in the sentence
        var rootRelatedDependencies = dependencies
            .Where(d =>
                ((string.Equals(root, lemmatizer.Lemmatize(d.Gov().GetWord()), StringComparison.InvariantCultureIgnoreCase)
                  && d.Gov().Index() < d.Dep().Index())
                 || (string.Equals(root, lemmatizer.Lemmatize(d.Dep().GetWord()), StringComparison.InvariantCultureIgnoreCase)
                     && d.Dep().Index() < d.Gov().Index()))
                // if the phrasal verb is inseparable, no word must be between the root and the particle
                && (pv.Inseparable != true || Math.Abs(d.Dep().Index() - d.Gov().Index()) == 1)
                // if the phrasal verb is mandatorily separable, at least one word must be between the root and the particle
                && (pv.SeparableMandatory != true || Math.Abs(d.Dep().Index() - d.Gov().Index()) > 1))
            .ToList();

        if (!rootRelatedDependencies.Any() || parts.Count < 2)
        {
            return false;
        }

        // Only the first particle is checked through the dependencies; for phrasal verbs
        // with several particles the other ones are checked by position below (approximation).
        var particle1 = parts[1];
        var relevantDependencies = rootRelatedDependencies
            .Where(d => d.Reln().GetShortName() == "prt")
            .ToList();
        if (!relevantDependencies.Any())
        {
            // no "prt" relation: fall back to every root-related dependency
            relevantDependencies = rootRelatedDependencies;
        }

        // the first relevant dependency having the particle as gov/dep, if any
        var rootParticle1Dependency = relevantDependencies
            .FirstOrDefault(d => string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase)
                                 || string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase));
        if (rootParticle1Dependency == null)
        {
            return false;
        }

        if (parts.Count <= 2)
        {
            // phrasal verb has 1 particle only; we're done
            return true;
        }

        // Otherwise check that the other particles are in the rest of the sentence (approximation).
        // NOTE(review): the "- 1" suggests Index() is 1-based, in which case the slice starts
        // at the first particle's own token — confirm against the parser.
        var lastTokenIndex = Math.Max(rootParticle1Dependency.Gov().Index(), rootParticle1Dependency.Dep().Index()) - 1;
        var endOfSentenceTokens = tokens.Skip(lastTokenIndex).ToList();

        // compare case-insensitively, like every other word comparison in this method
        return parts.Skip(2).All(p => endOfSentenceTokens.Contains(p, StringComparer.InvariantCultureIgnoreCase));
    }
    catch (Exception ex)
    {
        // Best-effort matching: log the failure with its cause and report "no match".
        // Guard the join so that a null token list cannot make the handler itself throw.
        var sentence = tokens == null ? string.Empty : string.Join(" ", tokens);
        Console.WriteLine("Exception raised when trying to match '{0}' in '{1}': {2}", pv, sentence, ex.Message);
    }

    // if we get here, matching failed
    return false;
}
/// <summary>
/// Tokenizes the sentence and checks the resulting token list against the phrasal verb
/// by delegating to the <c>IsMatch</c> overload that accepts that list.
/// </summary>
/// <param name="sentence">The raw sentence to tokenize.</param>
/// <param name="pv">The phrasal verb to look for.</param>
/// <returns>Whatever the delegated <c>IsMatch</c> call returns for the tokenized sentence.</returns>
public bool IsMatch(string sentence, PhrasalVerb pv)
{
    return IsMatch(tokenizer.Tokenize(sentence).ToList(), pv);
}