/// <summary>
/// Clears <c>result.Text</c> (sets it to null) when the sample's
/// <c>PredicateId</c> is contained in the <c>ids</c> collection.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    var predicateListed = ids.Contains(result.PredicateId);
    if (!predicateListed)
    {
        return;
    }

    result.Text = null;
}
/// <summary>
/// Clears <c>result.Text</c> (sets it to null) when the text is longer
/// than 1000 characters.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    const int maxLength = 1000;

    var tooLong = result.Text.Length > maxLength;
    if (!tooLong)
    {
        return;
    }

    result.Text = null;
}
/// <summary>
/// Clears <c>result.Text</c> (sets it to null) when <c>symbols</c>
/// reports a match against the text.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    var hasMatch = symbols.IsMatch(result.Text);
    if (!hasMatch)
    {
        return;
    }

    result.Text = null;
}
/// <summary>
/// Clears <c>result.Text</c> (sets it to null) when the text contains at
/// least one line-feed character ('\n').
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    // Only existence matters, so stop at the first line break instead of
    // counting every occurrence in the string.
    var hasLineBreak = result.Text.IndexOf('\n') >= 0;
    if (hasLineBreak)
    {
        result.Text = null;
    }
}
/// <summary>
/// Clears <c>result.Text</c> (sets it to null) unless the text contains
/// both <c>SubjectAnchor</c> and <c>ObjectAnchor</c>.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    var text = result.Text;

    // Both lookups are always performed, even if the first one already fails.
    var hasSubjectAnchor = text.Contains(result.SubjectAnchor);
    var hasObjectAnchor = text.Contains(result.ObjectAnchor);

    if (hasSubjectAnchor && hasObjectAnchor)
    {
        return;
    }

    result.Text = null;
}
/// <summary>
/// Clears <c>result.Text</c> (sets it to null) when the text length is
/// smaller than <c>_min</c> or greater than <c>_max</c>.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    // An earlier idea (left as a commented-out line in the original) derived the
    // minimum from the Object and Subject lengths plus 8; the configured bounds are used instead.
    var length = result.Text.Length;
    var outOfRange = length < _min || length > _max;
    if (!outOfRange)
    {
        return;
    }

    result.Text = null;
}
/// <summary>
/// Clears <c>result.Text</c> (sets it to null) in two cases: nothing is left
/// after removing every <c>r</c> match from the text, or <c>r2</c> matches
/// the text more than 6 times.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    var remainder = r.Replace(result.Text, "");

    // Short-circuit keeps the original order: r2 is only consulted when
    // something remains after stripping the r matches.
    if (string.IsNullOrEmpty(remainder) || r2.Matches(result.Text).Count > 6)
    {
        result.Text = null;
    }
}
/// <summary>
/// De-duplicates samples: hashes the concatenation of Object, Subject,
/// PredicateId and Text, and clears <c>result.Text</c> (sets it to null) when
/// that hash is already in <c>tripletHashes</c>; otherwise records the hash.
/// </summary>
/// <param name="result">Sample to inspect; its Text may be set to null.</param>
public void Preprocess(TripletTrain result)
{
    // NOTE(review): only the hash code is stored, so two different samples whose
    // hash codes collide would be treated as duplicates.
    var key = string.Concat(result.Object, result.Subject, result.PredicateId, result.Text);
    var fingerprint = key.GetHashCode();

    if (!tripletHashes.Contains(fingerprint))
    {
        tripletHashes.Add(fingerprint);
        return;
    }

    result.Text = null;
}
/// <summary>
/// Wraps <paramref name="text"/> in a new <c>TripletTrain</c> (only Text is set)
/// and applies each entry of <c>rules</c> in order, stopping as soon as a rule
/// leaves the text null or empty.
/// </summary>
/// <param name="text">Text to run through the rules.</param>
/// <returns>The text after the last applied rule; null or empty if a rule cleared it.</returns>
private string GoThroughRules(string text)
{
    var sample = new TripletTrain { Text = text };

    foreach (var rule in rules)
    {
        rule.Preprocess(sample);

        // A cleared text ends processing; remaining rules are not applied.
        if (string.IsNullOrEmpty(sample.Text))
        {
            return sample.Text;
        }
    }

    return sample.Text;
}
/// <summary>
/// Builds one training sample per article position of every stored triplet that
/// has article positions, applies <c>_rules</c> to it in order, and inserts the
/// samples whose text survives into <c>_tripletsTrain</c>.
/// </summary>
/// <param name="stats">
/// Rejection counters keyed by rule type name. The counter of the rule that left a
/// sample's text null or empty is incremented; a key that is not present yet is
/// created and starts counting from zero.
/// </param>
/// <returns>The number of samples inserted.</returns>
public async Task<int> PreprocessTrain(Dictionary<string, int> stats)
{
    var counter = 0;
    var triplets = _triplets.Find(t => t.ArticlePositions != null).ToCursor();

    await triplets.ForEachAsync(async t =>
    {
        foreach (var p in t.ArticlePositions)
        {
            // NOTE(review): Object* is filled from the triplet's Subject* data and
            // Subject* from its Object* data. Names and anchors are swapped
            // consistently, so this looks deliberate - confirm.
            var tr = new TripletTrain
            {
                Object = t.SubjectWikiName,
                ObjectAnchor = p.SubjectPosition.Anchor,
                Subject = t.ObjectWikiName,
                SubjectAnchor = p.ObjectPosition.Anchor,
                Predicate = _properties.First(pr => pr.WikidataId == t.Property).ReadTitleUk,
                PredicateId = t.Property,
                Text = p.Text,
                WikipediaLink = "https://uk.wikipedia.org/wiki/" + p.ArticleTitle,
                WikipediaTitle = p.ArticleTitle
            };

            foreach (var rule in _rules)
            {
                rule.Preprocess(tr);
                if (string.IsNullOrEmpty(tr.Text))
                {
                    // Count the rejection against the rule that caused it. TryGetValue
                    // leaves the counter at 0 for an unseen rule, so a key the caller did
                    // not pre-seed no longer throws KeyNotFoundException.
                    var ruleName = rule.GetType().Name;
                    int rejectedSoFar;
                    stats.TryGetValue(ruleName, out rejectedSoFar);
                    stats[ruleName] = rejectedSoFar + 1;
                    break;
                }
            }

            // Also skips samples whose text was empty without any rule having run.
            if (string.IsNullOrEmpty(tr.Text))
            {
                continue;
            }

            counter++;
            await _tripletsTrain.InsertOneAsync(tr);
        }
    });

    return counter;
}
/// <summary>
/// Rewrites <c>result.Text</c> by replacing every <c>link</c> match with the
/// replacement pattern "$1$2$3" (capture groups 1 to 3, assuming <c>link</c> is a Regex).
/// </summary>
/// <param name="result">Sample whose Text is rewritten in place.</param>
public void Preprocess(TripletTrain result)
{
    var withoutLinks = link.Replace(result.Text, "$1$2$3");
    result.Text = withoutLinks;
}
/// <summary>
/// Rewrites <c>result.Text</c> with every <c>r</c> match replaced by an empty string.
/// </summary>
/// <param name="result">Sample whose Text is rewritten in place.</param>
public void Preprocess(TripletTrain result)
{
    var stripped = r.Replace(result.Text, "");
    result.Text = stripped;
}
/// <summary>
/// Rewrites <c>result.Text</c> in three passes: <c>regex</c> matches are removed,
/// then <c>regex2</c> matches become "(" and <c>regex3</c> matches become ")".
/// </summary>
/// <param name="result">Sample whose Text is rewritten in place.</param>
public void Preprocess(TripletTrain result)
{
    // Each pass works on the output of the previous one.
    var text = regex.Replace(result.Text, "");
    text = regex2.Replace(text, "(");
    text = regex3.Replace(text, ")");

    result.Text = text;
}
/// <summary>
/// Removes leading and trailing whitespace from <c>result.Text</c>.
/// </summary>
/// <param name="result">Sample whose Text is trimmed in place.</param>
public void Preprocess(TripletTrain result)
{
    var trimmed = result.Text.Trim();
    result.Text = trimmed;
}
/// <summary>
/// Assigns a newly generated GUID, in its default string form, to <c>result.Id</c>.
/// </summary>
/// <param name="result">Sample that receives the new identifier.</param>
public void Preprocess(TripletTrain result)
{
    var freshId = Guid.NewGuid();
    result.Id = freshId.ToString();
}
/// <summary>
/// Computes <c>LevenshteinDistanceNormalized</c> between Object and ObjectAnchor,
/// and between Subject and SubjectAnchor.
/// </summary>
/// <param name="result">Sample whose names and anchors are compared; not modified here.</param>
public void Preprocess(TripletTrain result)
{
    // NOTE(review): neither distance is used after being computed, so this rule
    // currently never changes the sample - looks unfinished, confirm intent.
    var objectToAnchor = LevenshteinDistanceNormalized(result.Object, result.ObjectAnchor);
    var subjectToAnchor = LevenshteinDistanceNormalized(result.Subject, result.SubjectAnchor);
}