/// <summary>
/// Builds one <c>LogSectionParser</c> for every entry in <c>LogSections</c> and publishes
/// the result as the read-only <c>SectionParsers</c> collection.
/// </summary>
static LogParser()
{
    var sectionParsers = new List<LogSectionParser>(LogSections.Count);
    foreach (var section in LogSections)
    {
        var sectionParser = new LogSectionParser
        {
            // Sections that define no per-line callback get a no-op that is already completed.
            OnLineCheckAsync = section.OnNewLineAsync ?? ((l, s) => Task.CompletedTask),
            OnSectionEnd = section.OnSectionEnd,
            EndTrigger = section.EndTrigger.ToLatin8BitEncoding(),
        };

        // The idea here is to construct an Aho-Corasick parser that will look for any data marker
        // and run the associated regex to extract the data into state.
        if (section.Extractors?.Count > 0)
        {
            // One (marker, action) pair per extractor; the action forwards to OnExtractorHit
            // with the extractor's value.
            var markerActions = section.Extractors.Select(
                extractor => new SectionAction(
                    extractor.Key.ToLatin8BitEncoding(),
                    (buffer, state) => OnExtractorHit(buffer, extractor.Value, state)
                )
            );
            var markerTrie = new AhoCorasickDoubleArrayTrie<Action<string, LogParseState>>(markerActions, true);

            sectionParser.OnExtract = (line, buffer, state) =>
            {
                // Every marker found in the line triggers its associated action.
                markerTrie.ParseText(line, hit =>
                {
                    hit.Value(buffer, state);
                });
            };
        }

        sectionParsers.Add(sectionParser);
    }

    SectionParsers = sectionParsers.AsReadOnly();
}
/// <summary>
/// Placeholder for writing the dictionary cache; the implementation has not been ported,
/// so nothing is written.
/// </summary>
/// <param name="path">Currently unused.</param>
/// <param name="trie">Currently unused.</param>
/// <param name="entrySet">Currently unused.</param>
/// <returns>Always <c>true</c>.</returns>
static bool saveDat(String path, AhoCorasickDoubleArrayTrie<String> trie, HashSet<KeyValuePair<String, String>> entrySet)
{
    // Reference implementation (Java syntax) kept for whoever ports this method.
    // The log message was translated from Chinese.
    //
    // try
    // {
    //     DataOutputStream out = new DataOutputStream(new FileOutputStream(path + Predefine.BIN_EXT));
    //     out.writeInt(entrySet.size());
    //     for (Map.Entry<String, String> entry : entrySet)
    //     {
    //         char[] charArray = entry.getValue().toCharArray();
    //         out.writeInt(charArray.length);
    //         for (char c : charArray)
    //         {
    //             out.writeChar(c);
    //         }
    //     }
    //     trie.save(out);
    //     out.close();
    // }
    // catch (Exception e)
    // {
    //     logger.warning("Failed to cache value dat " + path);
    //     return false;
    // }
    return true;
}
/// <summary>
/// Placeholder for longest-match segmentation; the implementation has not been ported.
/// </summary>
/// <param name="charArray">Currently unused.</param>
/// <param name="trie">Currently unused.</param>
/// <returns>Always <c>null</c>.</returns>
protected static String segLongest(char[] charArray, AhoCorasickDoubleArrayTrie<String> trie)
{
    // Reference implementation (Java syntax) kept for whoever ports this method.
    // For each start offset it remembers the longest hit, then emits either that hit's
    // value or the raw character when no hit starts there.
    //
    // final String[] wordNet = new String[charArray.length];
    // final int[] lengthNet = new int[charArray.length];
    // trie.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<String>()
    // {
    //     @Override
    //     public void hit(int begin, int end, String value)
    //     {
    //         int length = end - begin;
    //         if (length > lengthNet[begin])
    //         {
    //             wordNet[begin] = value;
    //             lengthNet[begin] = length;
    //         }
    //     }
    // });
    // StringBuilder sb = new StringBuilder(charArray.length);
    // for (int offset = 0; offset < wordNet.length; )
    // {
    //     if (wordNet[offset] == null)
    //     {
    //         sb.append(charArray[offset]);
    //         ++offset;
    //         continue;
    //     }
    //     sb.append(wordNet[offset]);
    //     offset += lengthNet[offset];
    // }
    // return sb.toString();
    return null;
}
/// <summary>
/// Placeholder for reading the dictionary cache; the implementation has not been ported,
/// so nothing is loaded into <paramref name="trie"/>.
/// </summary>
/// <param name="path">Currently unused.</param>
/// <param name="trie">Currently unused.</param>
/// <returns>Always <c>true</c>.</returns>
static bool loadDat(String path, AhoCorasickDoubleArrayTrie<String> trie)
{
    // Reference implementation (Java syntax) kept for whoever ports this method.
    //
    // ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
    // if (byteArray == null) return false;
    // int size = byteArray.nextInt();
    // String[] valueArray = new String[size];
    // for (int i = 0; i < valueArray.length; ++i)
    // {
    //     valueArray[i] = byteArray.nextString();
    // }
    // trie.load(byteArray, valueArray);
    return true;
}
/// <summary>
/// Builds a trie over <c>SkillSetMapper.SkillSet</c>, mapping each skill to its index (as a string),
/// scans <paramref name="line"/> with it and checks the hits.
/// </summary>
/// <param name="line">Text to scan; the first hit is expected to be the "MATLAB" skill.</param>
public void TestAhoCorasickDoubleArrayTrieForSingleLine(string line)
{
    var acdat = new AhoCorasickDoubleArrayTrie<string>();
    var pairs = SkillSetMapper.SkillSet.Select((k, i) => new KeyValuePair<string, string>(k, i.ToString()));
    acdat.Build(pairs, true);

    var collectedValues = new List<string>();
    acdat.ParseText(line, hit =>
    {
        collectedValues.Add(hit.Value);
        return true;
    });

    Assert.IsNotEmpty(collectedValues);

    // Every collected value must be an index that resolves to an entry of the skill set.
    var collectedValuesresult = collectedValues.Where(i => SkillSetMapper.SkillSet.ElementAtOrDefault(int.Parse(i)) == null);
    Assert.IsEmpty(collectedValuesresult);

    var keyWord = SkillSetMapper.SkillSet[int.Parse(collectedValues.FirstOrDefault())].Trim();

    // AreEqual reports the actual keyword on failure, whereas Assert.True(a == b) only says "expected true".
    Assert.AreEqual("MATLAB", keyWord);
}
/// <summary>
/// Reloads the enabled piracy strings from the database and rebuilds one matcher per
/// <c>FilterContext</c> value, then swaps the result into <c>filters</c>.
/// A context with no matching strings is mapped to <c>null</c>.
/// </summary>
public static void RebuildMatcher()
{
    var rebuilt = new Dictionary<FilterContext, AhoCorasickDoubleArrayTrie<Piracystring>>();
    using (var db = new BotDb())
    {
        foreach (FilterContext ctx in Enum.GetValues(typeof(FilterContext)))
        {
            var triggers = db.Piracystring
                .Where(ps => ps.Disabled == false && ps.Context.HasFlag(ctx))
                .AsNoTracking()
                .ToList();

            if (triggers.Count == 0)
            {
                rebuilt[ctx] = null;
                continue;
            }

            try
            {
                rebuilt[ctx] = new AhoCorasickDoubleArrayTrie<Piracystring>(triggers.ToDictionary(s => s.String, s => s), true);
            }
            catch (ArgumentException)
            {
                // Treated as a duplicate-key failure: report which trigger strings occur more than once...
                var duplicate = triggers
                    .GroupBy(ps => ps.String)
                    .Where(g => g.Count() > 1)
                    .Select(g => g.Key)
                    .ToList();
                Config.Log.Error($"Duplicate triggers defined for Context {ctx}: {string.Join(", ", duplicate)}");

                // ...and rebuild with indexer assignment so the last entry for a given string wins.
                var lastWins = new Dictionary<string, Piracystring>();
                foreach (var ps in triggers)
                {
                    lastWins[ps.String] = ps;
                }
                rebuilt[ctx] = new AhoCorasickDoubleArrayTrie<Piracystring>(lastWins, true);
            }
        }
    }

    filters = rebuilt;
}
// For this test to run, create a "Resumes" folder in the test execution directory and put some test résumés in it.
// Only technical skill matching is set up, so test with technical profiles.
public void TestAhoCorasickDoubleArrayTrieForManyResumes()
{
    var processor = new ResumeProcessor(new JsonOutputFormatter());
    var filePaths = Directory.GetFiles("Resumes").Select(Path.GetFullPath);

    // Trie over the skill set; each skill is mapped to its index rendered as a string.
    var acdat = new AhoCorasickDoubleArrayTrie<string>();
    var pairs = SkillSetMapper.SkillSet.Select((k, i) => new KeyValuePair<string, string>(k, i.ToString()));
    acdat.Build(pairs, true);

    foreach (var filePath in filePaths)
    {
        var rawInput = processor._inputReaders.ReadIntoList(filePath);

        var collectedValues = new List<string>();
        foreach (var line in rawInput)
        {
            acdat.ParseText(line, hit =>
            {
                collectedValues.Add(hit.Value);
                return true;
            });
        }

        // Each file must produce at least one hit.
        Assert.IsNotEmpty(collectedValues, $"No match found in file: {filePath}");
    }
}
/// <summary>
/// Builds a trie whose keys and values are the entries of <paramref name="list"/>, saves it to
/// "AhoCorasickDoubleArrayTrie.dat", then times 100000 <c>ParseText</c> calls over
/// <paramref name="txt"/> and prints the elapsed milliseconds to the console.
/// </summary>
/// <param name="list">Keywords to index; a repeated keyword simply overwrites its earlier entry.</param>
/// <param name="txt">Text scanned on every iteration of the timing loop.</param>
private static void AhoCorasickDoubleArrayTrieSearch(List<string> list, string txt)
{
    var keywords = new Dictionary<string, string>();
    for (int i = 0; i < list.Count; i++)
    {
        keywords[list[i]] = list[i];
    }

    var matcher = new AhoCorasickDoubleArrayTrie<string>(keywords);

    // File.Create truncates an existing file; File.OpenWrite would leave stale trailing bytes
    // behind whenever the previous file was longer. The using block closes the stream even if Save throws.
    using (var fs = File.Create("AhoCorasickDoubleArrayTrie.dat"))
    {
        matcher.Save(fs, true);
    }

    Stopwatch watch = new Stopwatch();
    watch.Start();
    for (int i = 0; i < 100000; i++)
    {
        matcher.ParseText(txt);
    }
    watch.Stop();

    Console.WriteLine(" AhoCorasickDoubleArrayTrie: " + watch.ElapsedMilliseconds.ToString("N0") + "ms");
}
/// <summary>
/// Loads a dictionary. The implementation has not been ported, so nothing is loaded
/// into <paramref name="trie"/>.
/// </summary>
/// <param name="path">Dictionary path (currently unused).</param>
/// <param name="trie">Trie to populate (currently unused).</param>
/// <param name="reverse">Whether to reverse the dictionary (currently unused).</param>
/// <returns>Always <c>true</c>.</returns>
static bool load(String path, AhoCorasickDoubleArrayTrie<String> trie, bool reverse)
{
    // Reference implementation (Java syntax) kept for whoever ports this method.
    // Comments and log messages were translated from Chinese.
    //
    // String datPath = path;
    // if (reverse)
    // {
    //     datPath += Predefine.REVERSE_EXT;
    // }
    // if (loadDat(datPath, trie)) return true;
    // // Load from the text file and try to generate the dat cache
    // StringDictionary dictionary = new StringDictionary("=");
    // if (!dictionary.load(path)) return false;
    // if (reverse) dictionary = dictionary.reverse();
    // HashSet<KeyValuePair<String, String>> entrySet = dictionary;
    // dictionary<String, String> map = new Dictionary<String, String>();
    // for (Map.Entry<String, String> entry : entrySet)
    // {
    //     map.put(entry.getKey(), entry.getValue());
    // }
    // logger.info("Building AhoCorasickDoubleArrayTrie, source: " + path);
    // trie.build(map);
    // logger.info("Caching double array " + datPath);
    // saveDat(datPath, trie, entrySet);
    return true;
}
/// <summary>
/// Reads the whole file at <paramref name="temp_txt_Location"/>, counts the occurrences of the
/// markers ".class", ".method", "interface", ".property" and ".assembly" in it, and stores those
/// counts plus a "length" entry in one of the two result dictionaries.
/// </summary>
/// <param name="temp_txt_Location">Path of the text file to scan.</param>
/// <param name="length">Integer in string form; stored under the "length" key via <c>Convert.ToInt32</c>.</param>
/// <param name="dllPath">Not used by this method.</param>
/// <param name="entityMissDic1">Receives the results when <paramref name="flag3"/> is <c>true</c>.</param>
/// <param name="entityMissDic2">Receives the results when <paramref name="flag3"/> is <c>false</c>.</param>
/// <param name="flag3">Selects which dictionary is filled.</param>
/// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
private static bool EntitiesReader(string temp_txt_Location, string length, string dllPath, Dictionary<string, int> entityMissDic1, Dictionary<string, int> entityMissDic2, bool flag3)
{
    // Read everything up front and release the file immediately; the using block also
    // guarantees the reader is closed if anything below throws.
    string text;
    using (var sr = new StreamReader(temp_txt_Location))
    {
        text = sr.ReadToEnd();
    }

    // Doubles as the keyword set for the matcher and as the per-marker hit counter.
    var keywords = new Dictionary<string, int>()
    {
        { ".class", 0 },
        { ".method", 0 },
        { "interface", 0 },
        { ".property", 0 },
        { ".assembly", 0 },
    };

    var matcher = new AhoCorasickDoubleArrayTrie<int>(keywords);
    matcher.ParseText(text, (hit) =>
    {
        // Bump the counter of whichever marker was hit; text that is not a known marker is ignored.
        var marker = text.Substring(hit.Begin, hit.Length);
        int currentCount;
        if (keywords.TryGetValue(marker, out currentCount))
        {
            keywords[marker] = currentCount + 1;
        }
    });

    var target = flag3 ? entityMissDic1 : entityMissDic2;
    foreach (var entry in keywords)
    {
        target[entry.Key] = entry.Value;
    }
    target["length"] = Convert.ToInt32(length);

    return true;
}
/// <summary>
/// Rebuilds <c>matcher</c> from <c>PiracyStrings</c>, mapping every string to itself.
/// When there are no strings, <c>matcher</c> is set to <c>null</c>.
/// </summary>
private static void RebuildMatcher()
{
    if (PiracyStrings.Count == 0)
    {
        matcher = null;
        return;
    }

    var identityMap = PiracyStrings.ToDictionary(s => s, s => s);
    matcher = new AhoCorasickDoubleArrayTrie<string>(identityMap, true);
}
/// <summary>
/// Creates the skill matcher and builds it from <c>SkillSet</c>, mapping each skill
/// to its position in the set rendered as a string.
/// </summary>
public SkillSetMapper()
{
    __skillSetMatcher = new AhoCorasickDoubleArrayTrie<string>();

    var indexedSkills = SkillSet.Select(
        (skill, position) => new KeyValuePair<string, string>(skill, position.ToString()));
    __skillSetMatcher.Build(indexedSkills, true);
}
/// <summary>
/// Loads the contents of <paramref name="path"/> into <paramref name="trie"/> without reversing,
/// by delegating to the three-argument overload.
/// </summary>
/// <param name="path">Dictionary path, passed through unchanged.</param>
/// <param name="trie">Trie to populate, passed through unchanged.</param>
/// <returns>Whatever the three-argument overload returns.</returns>
static bool load(String path, AhoCorasickDoubleArrayTrie<String> trie)
{
    return load(path, trie, reverse: false);
}