Example #1
0
 /// <summary>
 /// Reads <paramref name="file"/> line by line, tokenizes every line with a
 /// <c>SimpleTokenizer</c> and passes each token, converted with <c>ToUpper()</c>,
 /// to <c>AddToken</c>.
 /// </summary>
 /// <param name="file">Path of the text file to read; checked with <c>Utils.VerifyFileNameOpen</c>.</param>
 /// <param name="tokType">Value assigned to the tokenizer's <c>Type</c> property.</param>
 /// <remarks>
 /// An <c>ArgumentValueException</c> is handed to <c>Utils.ThrowException</c> when the file
 /// name check fails, and an <c>InvalidOperationException</c> when <c>mIsRanked</c> is set
 /// (presumably <c>ThrowException</c> throws any non-null argument - confirm in Utils).
 /// </remarks>
 public void AddTokensFromFile(string file, TokenizerType tokType)
 {
     Utils.ThrowException(!Utils.VerifyFileNameOpen(file) ? new ArgumentValueException("file") : null);
     Utils.ThrowException(mIsRanked ? new InvalidOperationException() : null);
     SimpleTokenizer tokenizer = new SimpleTokenizer();
     tokenizer.Type = tokType;
     // When a Unicode signature is reported the reader is opened with its default settings;
     // otherwise UTF-8 is requested explicitly.
     // The using block releases the file handle even if tokenizing or AddToken throws.
     using (StreamReader reader = Utils.GetUnicodeSignature(file) != null ? new StreamReader(file) : new StreamReader(file, Encoding.UTF8))
     {
         string line;
         while ((line = reader.ReadLine()) != null)
         {
             tokenizer.Text = line;
             foreach (string token in tokenizer)
             {
                 AddToken(token.ToUpper());
             }
         }
     }
 }
Example #2
0
 /// <summary>
 /// Tokenizes <paramref name="str"/> with a <c>SimpleTokenizer</c> and forwards the
 /// tokenizer to <c>AddTokens</c>.
 /// </summary>
 /// <param name="str">Text to tokenize; a null value produces an <c>ArgumentNullException</c> for <c>Utils.ThrowException</c>.</param>
 /// <param name="tokType">Value assigned to the tokenizer's <c>Type</c> property.</param>
 public void AddTokensFromString(string str, TokenizerType tokType)
 {
     // Argument and state checks: an exception object is only created when the check fails.
     Utils.ThrowException(str != null ? null : new ArgumentNullException("str"));
     Utils.ThrowException(!mIsRanked ? null : new InvalidOperationException());

     // Initializer members are assigned in the order written: Type first, then Text.
     SimpleTokenizer tokenizer = new SimpleTokenizer
     {
         Type = tokType,
         Text = str
     };

     AddTokens(tokenizer);
 }
Example #3
0
        /// <summary>
        /// Tokenizes every article with a <c>SimpleTokenizer</c>, counts how often each token
        /// occurs across all articles, and returns only the tokens seen more than twice.
        /// </summary>
        /// <param name="articles">Texts to tokenize and count.</param>
        /// <returns>A dictionary mapping each token with a count greater than 2 to that count.</returns>
        public static Dictionary<string, int> CountTokens(IEnumerable<string> articles)
        {
            Tokenizer tokenizer = new SimpleTokenizer();
            var counts = new Dictionary<string, int>();

            foreach (var article in articles)
            {
                foreach (var token in tokenizer.tokenize(article))
                {
                    // A token that has not been seen yet starts at 1.
                    int seen;
                    counts[token] = counts.TryGetValue(token, out seen) ? seen + 1 : 1;
                }
            }

            // Keep only the tokens that occurred at least three times.
            var frequent = new Dictionary<string, int>();
            foreach (var entry in counts)
            {
                if (entry.Value > 2)
                {
                    frequent.Add(entry.Key, entry.Value);
                }
            }

            return frequent;
        }
        /// <summary>
        /// Runs every regex in <c>Regexes</c> over <paramref name="text"/>, marks the characters
        /// covered by any match, and emits one <c>ExtractResult</c> for each maximal run of
        /// covered characters that coincides exactly with one of the regex matches.
        /// </summary>
        /// <param name="text">Input text; null or empty input yields an empty list.</param>
        /// <returns>
        /// The extracted results in ascending start order. <c>Data</c> holds the value that
        /// <c>Regexes</c> associates with the regex that produced the match.
        /// </returns>
        public override List<ExtractResult> Extract(string text)
        {
            var result = new List<ExtractResult>();

            if (string.IsNullOrEmpty(text))
            {
                return result;
            }

            var matchSource = new Dictionary<Match, string>();
            var matched = new bool[text.Length];

            var collections = Regexes.ToDictionary(o => o.Key.Matches(text), p => p.Value);

            foreach (var collection in collections)
            {
                foreach (Match m in collection.Key)
                {
                    for (var j = 0; j < m.Length; j++)
                    {
                        matched[m.Index + j] = true;
                    }

                    // Keep Source Data for extra information
                    matchSource.Add(m, collection.Value);
                }
            }

            var lastNotMatched = -1;

            for (var i = 0; i < text.Length; i++)
            {
                if (!matched[i])
                {
                    lastNotMatched = i;
                    continue;
                }

                // Only act on the last character of a run of matched characters.
                if (i + 1 != text.Length && matched[i + 1])
                {
                    continue;
                }

                var start = lastNotMatched + 1;
                var length = i - lastNotMatched;
                var substr = text.Substring(start, length);

                // Skip a span that starts with the ellipsis marker when it is glued to a
                // preceding non-CJK letter or digit.
                if (substr.StartsWith(Constants.IPV6_ELLIPSIS, StringComparison.Ordinal) &&
                    start > 0 && char.IsLetterOrDigit(text[start - 1]) && !SimpleTokenizer.IsCjk(text[start - 1]))
                {
                    continue;
                }

                // Skip a span that ends with the ellipsis marker when it is glued to a
                // following non-CJK letter or digit. Both tests must look at the character
                // right after the span (i + 1); the CJK test previously read text[start + 1].
                if (substr.EndsWith(Constants.IPV6_ELLIPSIS, StringComparison.Ordinal) &&
                    i + 1 < text.Length && char.IsLetterOrDigit(text[i + 1]) && !SimpleTokenizer.IsCjk(text[i + 1]))
                {
                    continue;
                }

                // A run is reported only when one regex match covers exactly this span;
                // runs formed by merging several overlapping matches are dropped.
                var srcMatch = matchSource.Keys.FirstOrDefault(o => o.Index == start && o.Length == length);
                if (srcMatch != null)
                {
                    result.Add(new ExtractResult
                    {
                        Start = start,
                        Length = length,
                        Text = substr,
                        Type = ExtractType,
                        Data = matchSource[srcMatch],
                    });
                }
            }

            return result;
        }
Example #5
0
        /// <summary>
        /// Adds to <paramref name="ers"/> the numbers found by <c>config.UnitNumExtractor</c>
        /// that directly follow an existing extraction, separated from it only by whitespace
        /// or by text fully matched by <c>config.CompoundUnitConnectorRegex</c>, and then
        /// sorts <paramref name="ers"/> by start position.
        /// </summary>
        /// <param name="source">Text the extractions refer to.</param>
        /// <param name="ers">Existing extractions; modified in place.</param>
        private void MergePureNumber(string source, List<ExtractResult> ers)
        {
            var numberResults = config.UnitNumExtractor.Extract(source);
            var candidates = new List<ExtractResult>();

            var unitIdx = 0;
            for (var numIdx = 0; numIdx < numberResults.Count; numIdx++)
            {
                var number = numberResults[numIdx];

                // Move past every existing extraction that ends before this number starts.
                // A number is only considered when this step advanced at least once.
                var advanced = false;
                while (unitIdx < ers.Count && ers[unitIdx].Start + ers[unitIdx].Length < number.Start)
                {
                    advanced = true;
                    unitIdx++;
                }

                if (!advanced)
                {
                    continue;
                }

                // Filter cases like "1 dollars 11a", "11" is not the fraction here.
                var numberEnd = number.Start + number.Length;
                if (source.Length > numberEnd)
                {
                    var following = source.Substring(numberEnd ?? 0, 1);
                    var nextChar = following[0];
                    if (char.IsLetter(nextChar) && !SimpleTokenizer.IsCjk(nextChar))
                    {
                        continue;
                    }
                }

                // Text between the end of the preceding extraction and the start of the number.
                var previous = ers[unitIdx - 1];
                var gapStart = previous.Start + previous.Length ?? 0;
                var gapEnd = number.Start ?? 0;
                var gap = source.Substring(gapStart, gapEnd - gapStart).Trim().ToLowerInvariant();

                // An empty gap means the two are separated by whitespace only; otherwise the
                // whole gap has to be a connector.
                if (!string.IsNullOrEmpty(gap))
                {
                    var connector = config.CompoundUnitConnectorRegex.Match(gap);
                    var coversGap = connector.Success && connector.Index == 0 && connector.Length == gap.Length;
                    if (!coversGap)
                    {
                        continue;
                    }
                }

                candidates.Add(number);
            }

            foreach (var candidate in candidates)
            {
                // A candidate is dropped when its start lies inside an extraction already in the list.
                var overlapsExisting = false;
                for (var k = 0; k < ers.Count && !overlapsExisting; k++)
                {
                    var existing = ers[k];
                    overlapsExisting = existing.Start <= candidate.Start &&
                                       existing.Start + existing.Length >= candidate.Start;
                }

                if (!overlapsExisting)
                {
                    ers.Add(candidate);
                }
            }

            ers.Sort((left, right) => (left.Start - right.Start) ?? 0);
        }
Example #6
0
    /// <summary>
    /// Entry point of the command: parses the command-line options, obtains the source
    /// text (inline via -e / -c or from a single file argument), then tokenizes, parses
    /// and interprets it, stopping early when --tokenize or --parse is given.
    /// </summary>
    /// <param name="args">Command-line arguments.</param>
    static void Main(string[] args)
    {
        bool   tokenizeOnly    = false; // stop after tokenizing
        bool   parseOnly       = false; // stop after parsing
        bool   runAsCalculator = false; // run in calculator mode
        string src             = null;  // source text given directly on the command line

        // Parse the arguments; anything that is not an option is collected in rest.
        var rest = new List<string>();

        for (int i = 0; i < args.Length; i++)
        {
            var arg = args[i];
            switch (arg)
            {
            case "-h":
            case "--help":
                showHelpAndExit();
                break;

            case "-t":
            case "--tokenize":
                tokenizeOnly = true;
                break;

            case "-p":
            case "--parse":
                parseOnly = true;
                break;

            case "-d":
            case "--debug":
                Logger.LogEnabled = true;
                break;

            case "-e":
                // The option needs a value; without this guard args[i + 1] would throw
                // IndexOutOfRangeException when -e is the last argument.
                if (i + 1 >= args.Length)
                {
                    showHelpAndExit();
                    return;
                }

                src = args[++i];
                break;

            case "-c":
            case "--calc":
                // Same guard as for -e: the option must be followed by the source text.
                if (i + 1 >= args.Length)
                {
                    showHelpAndExit();
                    return;
                }

                runAsCalculator = true;
                src             = args[++i];
                break;

            default:
                rest.Add(arg);
                break;
            }
        }

        // Prepare the execution stages.
        var tokenizer   = new SimpleTokenizer();
        var parser      = new Parser();
        var interpreter = new Interpreter();

        // Load the source file when no inline source was given.
        if (src == null)
        {
            // Exactly one file argument is required; otherwise show the help and stop.
            if (rest.Count != 1)
            {
                showHelpAndExit();
                // Defensive: showHelpAndExit presumably terminates the process; the return
                // keeps rest[0] from being read should it ever come back.
                return;
            }

            src = File.ReadAllText(rest[0]);
        }

        // Tokenize.
        var tokens = tokenizer.Tokenize(src);

        if (tokenizeOnly)
        {
            Console.WriteLine(string.Join(" ", tokens.Select(t => t.Text).ToArray()));
            exit(0);
        }

        // Parse.
        MyLang.Ast.Ast ast;
        if (runAsCalculator)
        {
            ast = parser.ParseCalcurator(tokens);
        }
        else
        {
            ast = parser.Parse(tokens);
        }

        if (parseOnly)
        {
            Console.WriteLine(new MyLang.AstDisplayer().BuildString(ast, false));
            exit(0);
        }

        // Execute with the interpreter.
        interpreter.Run(ast);

        exit(0);
    }
Example #7
0
 /// <summary>
 /// Assigns <c>m_Tokenizer</c> a new <c>SimpleTokenizer</c> constructed with the
 /// characters ',', ' ' and '!' (presumably its delimiter set - confirm against the
 /// SimpleTokenizer constructor).
 /// </summary>
 public void Initialize()
 {
     var tokenizer = new SimpleTokenizer(',', ' ', '!');
     m_Tokenizer = tokenizer;
 }