コード例 #1
0
    /// <summary>
    /// Command-line entry point: reads the options, loads the grammar dictionary, then walks
    /// every requested parsing file and passes each fetched sentence to <c>ProcessSentence</c>.
    /// </summary>
    /// <remarks>
    /// Recognised options, each followed by exactly one value:
    /// <c>-parsing</c> (repeatable; a directory, a wildcard mask or a single file),
    /// <c>-dict</c>, <c>-templates</c>, <c>-output</c>, <c>-max_samples</c>, <c>-skip</c>,
    /// <c>-max_len</c> and <c>-filter</c> (comma-separated list).
    /// </remarks>
    /// <param name="args">Raw command-line arguments.</param>
    /// <exception cref="ApplicationException">
    /// An option is not recognised, or a recognised option is not followed by a value.
    /// </exception>
    static void Main(string[] args)
    {
        string        result_folder    = @"f:\tmp";
        List <string> parsed_sentences = new List <string>();

        int    MAX_SAMPLE       = int.MaxValue;
        int    MAX_LEN          = int.MaxValue;
        string dictionary_path  = "";
        string syntax_templates = "";

        string[] filters = { "3" };

        #region Command_Line_Options
        // Every recognised option consumes the argument that follows it.
        string[] known_options =
        {
            "-parsing", "-dict", "-templates", "-output",
            "-max_samples", "-skip", "-max_len", "-filter"
        };

        for (int i = 0; i < args.Length; ++i)
        {
            string option = args[i];

            if (Array.IndexOf(known_options, option) < 0)
            {
                throw new ApplicationException(string.Format("Unknown option {0}", option));
            }

            // Fail with a readable message instead of an IndexOutOfRangeException
            // when the last argument is an option without its value.
            if (i + 1 >= args.Length)
            {
                throw new ApplicationException(string.Format("Missing value for option {0}", option));
            }

            string value = args[++i];

            switch (option)
            {
                case "-parsing":
                    parsed_sentences.Add(value);
                    break;

                case "-dict":
                    dictionary_path = value;
                    break;

                case "-templates":
                    syntax_templates = value;
                    break;

                case "-output":
                    result_folder = value;
                    break;

                case "-max_samples":
                    MAX_SAMPLE = int.Parse(value);
                    break;

                case "-skip":
                    nb_skip = int.Parse(value);
                    break;

                case "-max_len":
                    MAX_LEN = int.Parse(value);
                    break;

                case "-filter":
                    filters = value.Split(',');
                    break;
            }
        }
        #endregion Command_Line_Options

        preprocessor = new Preprocessor();
        if (!string.IsNullOrEmpty(syntax_templates))
        {
            syntax_checker.LoadTemplates(syntax_templates);
        }

        Console.WriteLine("Loading dictionary {0}", dictionary_path);
        SolarixGrammarEngineNET.GrammarEngine2 gren = new SolarixGrammarEngineNET.GrammarEngine2();
        gren.Load(dictionary_path, true);

        try
        {
            // Output file for the selected fact sentences.
            wrt_samples = new System.IO.StreamWriter(System.IO.Path.Combine(result_folder, "facts.txt"));

            // Sentences that did not pass the detailed check of the syntactic structure.
            wrt_skipped = new System.IO.StreamWriter(System.IO.Path.Combine(result_folder, "skipped.txt"));

            // Predicate filter; possible values are "3", "2s" and "1s".
            // NOTE(review): stays null when -filter lists none of these values - confirm
            // that ProcessSentence accepts a null verb filter.
            string[] verb_filters = { "3", "1s", "2s" };
            string filter_verb = filters.FirstOrDefault(z => verb_filters.Contains(z));

            // Sentence type filter; allowed values are empty or "q".
            string filter_sent = filters.FirstOrDefault(z => z == "q") ?? "";

            #region Processing_All_Files
            foreach (string mask in parsed_sentences)
            {
                string[] files;
                if (System.IO.Directory.Exists(mask))
                {
                    files = System.IO.Directory.GetFiles(mask, "*.parsing.txt");
                }
                else if (mask.IndexOfAny("*?".ToCharArray()) != -1)
                {
                    // A bare mask such as "*.parsing.txt" has no directory part; GetFiles
                    // rejects an empty path, so search the current directory instead.
                    string mask_folder = System.IO.Path.GetDirectoryName(mask);
                    if (string.IsNullOrEmpty(mask_folder))
                    {
                        mask_folder = ".";
                    }

                    files = System.IO.Directory.GetFiles(mask_folder, System.IO.Path.GetFileName(mask));
                }
                else
                {
                    files = new string[] { mask };
                }

                Console.WriteLine("Number of parsing files={0}", files.Length);

                foreach (string file in files)
                {
                    if (sample_count >= MAX_SAMPLE)
                    {
                        break;
                    }

                    Console.WriteLine("Processing {0}...", file);

                    using (Sentences src = new Sentences(file))
                    {
                        while (src.Next() && sample_count < MAX_SAMPLE)
                        {
                            Sentence sent = src.GetFetched();

                            // Counts every fetched sentence, including those without a root.
                            sample_count++;

                            if (sent.root == null)
                            {
                                continue;
                            }

                            if (sample_count > 0 && (sample_count % 10000) == 0)
                            {
                                Console.Write("{0} samples extracted\r", sample_count);
                            }

                            // Sentences fetched before the counter reaches nb_skip are not processed.
                            if (sample_count >= nb_skip)
                            {
                                ProcessSentence(gren, sent, MAX_LEN, filter_verb, filter_sent);
                            }
                        }
                    }
                }
            }
            #endregion Processing_All_Files
        }
        finally
        {
            // Close the writers even when processing fails, so buffered output is
            // flushed and the file handles are released.
            if (wrt_samples != null)
            {
                wrt_samples.Close();
            }

            if (wrt_skipped != null)
            {
                wrt_skipped.Close();
            }
        }
    }