// Reads the given file line by line, tokenizes each line with the requested
// tokenizer type, and adds every token (upper-cased) to this collection.
// Throws ArgumentValueException when the file cannot be opened and
// InvalidOperationException when the collection has already been ranked.
// Fix: the reader is now wrapped in 'using' so the underlying stream is
// closed even when tokenization throws (the original only called Close()
// on the success path, leaking the file handle on exception).
public void AddTokensFromFile(string file, TokenizerType tokType)
{
    Utils.ThrowException(!Utils.VerifyFileNameOpen(file) ? new ArgumentValueException("file") : null);
    Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
    // Honor an explicit Unicode signature (BOM) when present; otherwise read as UTF-8.
    using (StreamReader reader = Utils.GetUnicodeSignature(file) != null
        ? new StreamReader(file)
        : new StreamReader(file, Encoding.UTF8))
    {
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.Type = tokType;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            tokenizer.Text = line;
            foreach (string token in tokenizer)
            {
                // NOTE(review): ToUpper() is culture-sensitive (e.g. Turkish 'i');
                // kept as-is for compatibility with tokens added elsewhere, but
                // ToUpperInvariant() is likely what is intended — confirm.
                AddToken(token.ToUpper());
            }
        }
    }
}
// Tokenizes the supplied string with the requested tokenizer type and adds
// the resulting tokens to this collection.
// Throws ArgumentNullException when 'str' is null and
// InvalidOperationException when the collection has already been ranked.
public void AddTokensFromString(string str, TokenizerType tokType)
{
    Utils.ThrowException(str == null ? new ArgumentNullException("str") : null);
    Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
    SimpleTokenizer tokenizer = new SimpleTokenizer
    {
        Type = tokType,
        Text = str
    };
    AddTokens(tokenizer);
}
// Tokenizes every article and returns per-token occurrence counts across
// the whole input, keeping only tokens that occur more than twice.
// Fix/cleanup: removed the large blocks of commented-out dead code (an
// abandoned resource-file/regex approach) and collapsed the counting to a
// single-lookup TryGetValue increment.
public static Dictionary<string, int> CountTokens(IEnumerable<string> articles)
{
    Tokenizer t = new SimpleTokenizer();
    var words = new Dictionary<string, int>();

    foreach (var article in articles)
    {
        var tokens = t.tokenize(article);
        foreach (var token in tokens)
        {
            // TryGetValue leaves currentCount at 0 when the token is new,
            // so one lookup covers both the insert and the increment cases.
            words.TryGetValue(token, out int currentCount);
            words[token] = currentCount + 1;
        }
    }

    // Drop rare tokens: only counts greater than 2 are kept (noise filter).
    return words.Where(p => p.Value > 2).ToDictionary(p => p.Key, p => p.Value);
}
// Runs every configured regex over the text, marks the characters each
// match covers, and emits one ExtractResult per maximal matched run,
// skipping IPv6-ellipsis forms glued to adjacent non-CJK alphanumerics.
public override List<ExtractResult> Extract(string text)
{
    var result = new List<ExtractResult>();
    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    var matchSource = new Dictionary<Match, string>();
    var matched = new bool[text.Length];

    // Evaluate each regex and flag every character its matches cover.
    var collections = Regexes.ToDictionary(o => o.Key.Matches(text), p => p.Value);
    foreach (var collection in collections)
    {
        foreach (Match m in collection.Key)
        {
            for (var j = 0; j < m.Length; j++)
            {
                matched[m.Index + j] = true;
            }

            // Keep Source Data for extra information
            matchSource.Add(m, collection.Value);
        }
    }

    var lastNotMatched = -1;
    for (var i = 0; i < text.Length; i++)
    {
        if (matched[i])
        {
            // End of a maximal matched run: [lastNotMatched + 1, i].
            if (i + 1 == text.Length || !matched[i + 1])
            {
                var start = lastNotMatched + 1;
                var length = i - lastNotMatched;
                var substr = text.Substring(start, length);

                // Leading ellipsis preceded by a non-CJK letter/digit: not a real match.
                if (substr.StartsWith(Constants.IPV6_ELLIPSIS, StringComparison.Ordinal) &&
                    (start > 0 && char.IsLetterOrDigit(text[start - 1]) && !SimpleTokenizer.IsCjk(text[start - 1])))
                {
                    continue;
                }

                // Trailing ellipsis followed by a non-CJK letter/digit: not a real match.
                // Fix: both predicates must inspect the character AFTER the run,
                // text[i + 1]; the original passed text[start + 1] to IsCjk, which
                // is a character inside the match itself.
                if (substr.EndsWith(Constants.IPV6_ELLIPSIS, StringComparison.Ordinal) &&
                    (i + 1 < text.Length && char.IsLetterOrDigit(text[i + 1]) && !SimpleTokenizer.IsCjk(text[i + 1])))
                {
                    continue;
                }

                bool MatchFunc(Match o) => o.Index == start && o.Length == length;

                if (matchSource.Keys.Any(MatchFunc))
                {
                    var srcMatch = matchSource.Keys.First(MatchFunc);

                    result.Add(new ExtractResult
                    {
                        Start = start,
                        Length = length,
                        Text = substr,
                        Type = ExtractType,
                        // srcMatch was taken from matchSource.Keys, so the lookup cannot miss.
                        Data = matchSource[srcMatch],
                    });
                }
            }
        }
        else
        {
            lastNotMatched = i;
        }
    }

    return result;
}
// Finds "pure number" extractions that directly follow an existing unit
// extraction — separated only by whitespace or by a connector that matches
// CompoundUnitConnectorRegex — and merges them into 'ers'. Mutates 'ers'
// in place (appends, then re-sorts by Start).
private void MergePureNumber(string source, List<ExtractResult> ers)
{
    var numErs = config.UnitNumExtractor.Extract(source);
    var unitNumbers = new List<ExtractResult>();
    // Two-pointer sweep: j advances over 'ers' until ers[j] no longer ends
    // before numErs[i] starts, so ers[j - 1] is the nearest extraction that
    // ends before the current number. Presumably both lists are ordered by
    // Start — TODO confirm against the extractors' contracts.
    for (int i = 0, j = 0; i < numErs.Count; i++)
    {
        bool hasBehindExtraction = false;
        while (j < ers.Count && ers[j].Start + ers[j].Length < numErs[i].Start)
        {
            hasBehindExtraction = true;
            j++;
        }

        // No extraction ends before this number, so there is nothing to attach it to.
        if (!hasBehindExtraction)
        {
            continue;
        }

        // Filter cases like "1 dollars 11a", "11" is not the fraction here.
        if (source.Length > numErs[i].Start + numErs[i].Length)
        {
            // NOTE: '??' binds looser than '+', so this is
            // (Length + Start) ?? 0 — the character right after the number.
            var endChar = source.Substring(numErs[i].Length + numErs[i].Start ?? 0, 1);
            if (char.IsLetter(endChar[0]) && !SimpleTokenizer.IsCjk(endChar[0]))
            {
                continue;
            }
        }

        // The text between the preceding extraction and this number.
        var middleBegin = ers[j - 1].Start + ers[j - 1].Length ?? 0;
        var middleEnd = numErs[i].Start ?? 0;
        var middleStr = source.Substring(middleBegin, middleEnd - middleBegin).Trim().ToLowerInvariant();

        // Separated by whitespace
        if (string.IsNullOrEmpty(middleStr))
        {
            unitNumbers.Add(numErs[i]);
            continue;
        }

        // Separated by connectors — the connector must span the whole gap.
        var match = config.CompoundUnitConnectorRegex.Match(middleStr);
        if (match.Success && match.Index == 0 && match.Length == middleStr.Length)
        {
            unitNumbers.Add(numErs[i]);
        }
    }

    // Add each candidate only if its start does not fall inside (or at the
    // edge of) an existing extraction.
    foreach (var extractResult in unitNumbers)
    {
        var overlap = false;
        foreach (var er in ers)
        {
            if (er.Start <= extractResult.Start && er.Start + er.Length >= extractResult.Start)
            {
                overlap = true;
            }
        }

        if (!overlap)
        {
            ers.Add(extractResult);
        }
    }

    // Restore ordering by start offset (Start is int?, hence the '?? 0').
    ers.Sort((x, y) => x.Start - y.Start ?? 0);
}
/// <summary>
/// Command entry point: parses the command line, then tokenizes, parses,
/// and finally interprets the given source.
/// </summary>
/// <param name="args">Command-line arguments.</param>
static void Main(string[] args)
{
    bool tokenizeOnly = false;    // stop after tokenizing
    bool parseOnly = false;       // stop after parsing
    bool runAsCalcurator = false; // run as a calculator
    string src = null;            // source given directly on the command line

    // Parse the arguments.
    var rest = new List<string>();
    for (int i = 0; i < args.Length; i++)
    {
        var arg = args[i];
        switch (arg)
        {
            case "-h":
            case "--help":
                showHelpAndExit();
                break;
            case "-t":
            case "--tokenize":
                tokenizeOnly = true;
                break;
            case "-p":
            case "--parse":
                parseOnly = true;
                break;
            case "-d":
            case "--debug":
                Logger.LogEnabled = true;
                break;
            case "-e":
                // Fix: guard against a missing operand — the original indexed
                // args[i + 1] unconditionally and crashed with
                // IndexOutOfRangeException on a trailing "-e".
                if (i + 1 >= args.Length)
                {
                    showHelpAndExit();
                }
                src = args[i + 1];
                i++;
                break;
            case "-c":
            case "--calc":
                runAsCalcurator = true;
                // Same missing-operand guard as "-e".
                if (i + 1 >= args.Length)
                {
                    showHelpAndExit();
                }
                src = args[i + 1];
                i++;
                break;
            default:
                rest.Add(arg);
                break;
        }
    }

    // Set up the executors.
    var tokenizer = new SimpleTokenizer();
    var parser = new Parser();
    var interpreter = new Interpreter();

    // Load the source file.
    if (src == null)
    {
        // Without exactly one file argument, show help and exit.
        if (rest.Count != 1)
        {
            showHelpAndExit();
        }
        src = File.ReadAllText(rest[0]);
    }

    // Tokenize.
    var tokens = tokenizer.Tokenize(src);
    if (tokenizeOnly)
    {
        Console.WriteLine(string.Join(" ", tokens.Select(t => t.Text).ToArray()));
        exit(0);
    }

    // Parse.
    MyLang.Ast.Ast ast;
    if (runAsCalcurator)
    {
        ast = parser.ParseCalcurator(tokens);
    }
    else
    {
        ast = parser.Parse(tokens);
    }

    if (parseOnly)
    {
        Console.WriteLine(new MyLang.AstDisplayer().BuildString(ast, false));
        exit(0);
    }

    // Run with the interpreter.
    interpreter.Run(ast);
    exit(0);
}
// Test setup: creates the tokenizer under test. The three characters are
// presumably the delimiter set (',', ' ', '!') — confirm against the
// SimpleTokenizer constructor's contract.
public void Initialize() { m_Tokenizer = new SimpleTokenizer(',', ' ', '!'); }