示例#1
0
        /// <summary>
        /// Console entry point for the coursework pipeline. Tasks 1-3 and the
        /// TF / IDF / TF-IDF calculation calls of task 4 are commented out; what
        /// actually runs is the construction of a <c>TfIdf</c> and a <c>Search</c>
        /// over the same start URL and data folder, followed by one search query.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string startUrl   = "https://ru.wikipedia.org/";
            string dataFolder = @"D:\IS\";

            // Task 1: crawling (disabled)
            //var crawler = new Crawler.Crawler(dataFolder);
            //crawler.Crawl(startUrl);

            // Task 2: stemming (disabled)
            //var stemming = new Stemming(startUrl, dataFolder);
            //stemming.StartStemming();

            // Task 3: inverted index and boolean search (disabled)
            //var inverter = new InvertList(startUrl, dataFolder);
            ////inverter.Invert();
            //var search = inverter.Search("оформляет & случае & качестве");
            //Console.WriteLine($"Найдено в документах с индексами: {string.Join(", ", search)}");

            // Task 4: TF-IDF. The object is still constructed; only the
            // calculation calls below are disabled.
            var tfIdf = new TfIdf(startUrl, dataFolder);
            //tfIdf.TF();
            //tfIdf.Idf();
            //tfIdf.TfIdfCalc();

            // Task 5: run a single query through the search component.
            new Search(startUrl, dataFolder).SearchWord("патрулирован свободный википедия");
        }
示例#2
0
        /// <summary>
        /// Scores every document returned by <c>DataLoad.AsrTestSet</c> against each
        /// <c>AsrIranCategories</c> value by summing <c>TfIdf.CalculateTfIdf</c> over the
        /// document's words, prints the five best-scoring categories, and prints the
        /// running accuracy after each document.
        /// </summary>
        /// <remarks>
        /// A document counts as correct when the top-scoring category equals the
        /// category stored with the document.
        /// </remarks>
        private static void CategoryPartOnTrainSet()
        {
            TfIdf.LoadDb();

            var testSet = DataLoad.AsrTestSet(DataLoad.FetchAsrDocuments);

            // The set of categories is fixed, so read it from the enum once instead of
            // once per word of every document.
            var categories = Enum.GetValues(typeof(AsrIranCategories))
                                 .Cast<AsrIranCategories>()
                                 .ToArray();

            float correct   = 0;
            float incorrect = 0;

            foreach (var document in testSet)
            {
                var(wordList, cat) = document.Item;

                // Every category starts at zero so that categories no word scores
                // for still appear in the ranking.
                var scores = new Dictionary<AsrIranCategories, double>();
                foreach (var category in categories)
                {
                    scores.Add(category, 0);
                }

                Console.WriteLine("Calculating...");
                foreach (var word in wordList)
                {
                    foreach (var category in categories)
                    {
                        scores[category] += TfIdf.CalculateTfIdf(word, category);
                    }
                }

                Console.WriteLine();
                Console.WriteLine();
                Console.ForegroundColor = ConsoleColor.Blue;
                Console.WriteLine("Correct Class: " + cat);

                // Materialise the ranking: it is read twice below (best entry, then the
                // top five) and a deferred OrderByDescending would sort again each time.
                var orderedScore = scores.OrderByDescending(x => x.Value).ToList();
                if (orderedScore.First().Key != cat)
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    incorrect++;
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Green;
                    correct++;
                }

                foreach (var item in orderedScore.Take(5))
                {
                    Console.WriteLine($"{item.Key} = {item.Value}");
                    // The colour is reset after each row, so only the first (best)
                    // row is printed in red/green.
                    Console.ResetColor();
                }
                Console.WriteLine();
                Console.WriteLine();

                Console.WriteLine("===== ACURACY  : " + correct * 100 / (correct + incorrect) + "======");
            }
        }
示例#3
0
        /// <summary>
        /// Reads every <c>*.json</c> file directly inside each directory named on the
        /// command line, computes TF-IDF over all of them via <c>TfIdf.GetAllTfIdf</c>,
        /// and writes the raw and the normalized results as indented JSON under two
        /// output folders, reusing each input path as the relative output path.
        /// </summary>
        /// <param name="args">Directories to scan; at least one is required.</param>
        static void Main(string[] args)
        {
            // Nothing to do without at least one input directory.
            if (args.Length == 0)
            {
                Console.Error.WriteLine("ディレクトリを指定してください");
                return;
            }

            var aggregate = new Dictionary<string, IEnumerable<MeCabResultAggregate>>();

            foreach (string directory in args)
            {
                foreach (string jsonPath in Directory.EnumerateFiles(directory, "*.json", SearchOption.TopDirectoryOnly))
                {
                    string json = File.ReadAllText(jsonPath);
                    IEnumerable<MeCabResultAggregate> words =
                        JsonConvert.DeserializeObject<IEnumerable<MeCabResultAggregate>>(json);

                    aggregate[jsonPath] = words;
                    Console.WriteLine("{0}内の単語数 : {1}", jsonPath, words.Count());
                }
            }

            Dictionary<string, Dictionary<string, double>> tfIdf = TfIdf.GetAllTfIdf(aggregate);

            Console.WriteLine("対象のファイル数 : {0}", aggregate.Count);

            // Raw scores first; normalization is only requested after they are written.
            foreach (KeyValuePair<string, Dictionary<string, double>> raw in tfIdf)
            {
                WriteIndentedJson("計算結果/" + raw.Key, raw.Value);
            }

            foreach (KeyValuePair<string, Dictionary<string, double>> normalized in TfIdf.Normalize(tfIdf))
            {
                WriteIndentedJson("計算結果_正規化/" + normalized.Key, normalized.Value);
            }
        }

        /// <summary>
        /// Serializes <paramref name="value"/> as indented JSON into <paramref name="path"/>,
        /// creating the containing directory first when it does not exist.
        /// </summary>
        private static void WriteIndentedJson(string path, object value)
        {
            string folder = Path.GetDirectoryName(path);
            if (!Directory.Exists(folder))
            {
                Directory.CreateDirectory(folder);
            }

            using (StreamWriter output = new StreamWriter(path))
            {
                output.Write(JsonConvert.SerializeObject(value, Formatting.Indented));
            }
        }
        /// <summary>
        /// Creates the engine: instantiates the <c>TfIdf</c> used for document-similarity
        /// analysis and loads the training set bundled in the app's Assets folder
        /// (<c>msg_non_ratio_4ngram_trainset.csv</c>, read as EUC-KR) into it.
        /// </summary>
        public PredictionEngine()
        {
            // TfIdf used for document-similarity analysis.
            tfidf = new TfIdf();

            // Read the file from the Assets folder. The stream is disposed as soon as
            // loading finishes (or throws) so the asset handle is not leaked.
            // NOTE(review): assumes Load_documents consumes the stream before returning - confirm.
            using (Stream trainSet = Android.App.Application.Context.Assets.Open("msg_non_ratio_4ngram_trainset.csv"))
            {
                // Loading documents that were already split into n-grams shortens the
                // computation time needed for a prediction.

                // Register the code-pages provider before asking for "euc-kr".
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
                tfidf.Load_documents(trainSet, Encoding.GetEncoding("euc-kr"));
            }
        }
示例#5
0
        /// <summary>
        /// Prompts for an n-gram size, extracts the n-grams of every text found under
        /// <c>Defaults.dataFilesPath</c> in parallel, saves them, then saves the result
        /// of <c>TfIdf.SortNGrams</c> over them. Any exception is printed to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.Write("Enter ngram size wanted : ");
            string value = Console.ReadLine();

            Console.Write(Environment.NewLine);
            int n = 0;

            while (!int.TryParse(value, out n))
            {
                // ReadLine returns null once standard input is closed; re-prompting
                // would then loop forever, so stop instead.
                if (value == null)
                {
                    Console.Error.WriteLine("No input available, exiting.");
                    return;
                }

                Console.Write("Error bad integer value try again : ");
                value = Console.ReadLine();
                Console.Write(Environment.NewLine);
            }

            try
            {
                // Get texts content
                IEnumerable<string> texts = FileManager.GetFilesContent(Defaults.dataFilesPath);

                // Get ngrams for all texts
                List<string> nGramsResult = new List<string>();
                object       resultGate   = new object();
                Ngrams       ng           = new Ngrams();
                var          options      = new ParallelOptions
                {
                    MaxDegreeOfParallelism = 20
                };

                // NOTE(review): the single Ngrams instance is shared by all workers;
                // this assumes GetNgramsWords is safe to call concurrently - confirm.
                Parallel.ForEach(texts, options, text =>
                {
                    // Materialise outside the lock so the extraction itself stays parallel.
                    var extracted = new List<string>(ng.GetNgramsWords(text, n));

                    // List<T> is not safe for concurrent writers; unsynchronised AddRange
                    // calls can lose items or corrupt the list.
                    lock (resultGate)
                    {
                        nGramsResult.AddRange(extracted);
                    }
                });

                // Save Ngrams to a file
                FileManager.SaveFile(Defaults.resultNGramFilePath, nGramsResult);

                // Sort ngrams (and delete duplicate in the same time) with tf idf
                var tfIdfResult = TfIdf.SortNGrams(nGramsResult);

                // Save sorted tf idf to a file
                FileManager.SaveFile(Defaults.resultTfIdfFilePath, tfIdfResult);
            }
            catch (Exception e)
            {
                Console.WriteLine(e.ToString());
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
示例#6
0
        /// <summary>
        /// Checks <c>TfIdf.TermWeight</c> against stubbed term weights and IDF values,
        /// including the term "third", which has a term weight but no IDF setup and is
        /// expected to weigh 0.
        /// </summary>
        public void TfIdfMeasureIsCorrectEvenIfWordsDoesNotExistInIdf()
        {
            // Arrange: three terms have a term weight...
            var termWeights = new Mock<ITermWeightRepresentation>();
            termWeights.Setup(w => w.TermWeight("test")).Returns(0.78);
            termWeights.Setup(w => w.TermWeight("two")).Returns(0.5);
            termWeights.Setup(w => w.TermWeight("third")).Returns(0.5);

            // ...but only two of them are configured on the IDF stub.
            var inverseFrequencies = new Mock<IIdf>();
            inverseFrequencies.Setup(i => i.Value("test")).Returns(0.23);
            inverseFrequencies.Setup(i => i.Value("two")).Returns(1d);

            var measure = new TfIdf(termWeights.Object, inverseFrequencies.Object);

            // Act + Assert: the expected values equal term weight x IDF
            // (0.78 x 0.23 = 0.1794, 0.5 x 1 = 0.5); the unconfigured term yields 0.
            const double tolerance = 0.0001;
            Assert.AreEqual(0.1794, measure.TermWeight("test"), tolerance);
            Assert.AreEqual(0.5, measure.TermWeight("two"), tolerance);
            Assert.AreEqual(0d, measure.TermWeight("third"), tolerance);
        }
示例#7
0
        /// <summary>
        /// Asks for a file path, echoes the file's text, tokenizes it with
        /// <c>DataCleanser.CleanseUserInput</c>, and prints every
        /// <c>AsrIranCategories</c> value with its summed <c>TfIdf.CalculateTfIdf</c>
        /// score, highest first.
        /// </summary>
        private static void GetUserInputCategory()
        {
            TfIdf.LoadDb();

            Console.Write("Enter file path: ");
            var filePath = Console.ReadLine();

            Console.WriteLine();
            var content = File.ReadAllText(filePath);

            Console.WriteLine(content);
            var tokens = DataCleanser.CleanseUserInput(content);

            // All categories start at zero so each one is listed even if no token scores for it.
            var categories = Enum.GetValues(typeof(AsrIranCategories))
                                 .Cast<AsrIranCategories>()
                                 .ToArray();
            Dictionary<AsrIranCategories, double> scores = categories.ToDictionary(c => c, c => 0d);

            Console.WriteLine("Calculating...");
            foreach (var token in tokens)
            {
                foreach (var category in categories)
                {
                    scores[category] += TfIdf.CalculateTfIdf(token, category);
                }
            }

            Console.WriteLine();
            Console.WriteLine();

            // The whole ranking is printed in blue; the colour is restored afterwards.
            Console.ForegroundColor = ConsoleColor.Blue;
            foreach (var ranked in scores.OrderByDescending(entry => entry.Value))
            {
                Console.WriteLine($"{ranked.Key} = {ranked.Value}");
            }
            Console.ResetColor();

            Console.WriteLine();
            Console.WriteLine();
        }
示例#8
0
        /// <summary>
        /// Computes a TF-IDF value for every distinct token of <paramref name="tokenizedText"/>.
        /// </summary>
        /// <remarks>
        /// For each distinct token: tf = 0.5 + 0.5 * count / (highest count in the text),
        /// idf = ln((CorpusSize + 1) / (df + 1)) where df comes from the document-frequency
        /// provider, and the stored value is tf * idf.
        /// </remarks>
        /// <param name="tokenizedText">Tokens of one text; repeated tokens are counted.</param>
        /// <returns>
        /// One <c>TfIdf</c> per distinct token, keyed by the entry's <c>Token</c>.
        /// Empty when <paramref name="tokenizedText"/> contains no tokens.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizedText"/> is null.</exception>
        public Dictionary <IToken, TfIdf> Calculate(IEnumerable <IToken> tokenizedText)
        {
            if (tokenizedText == null)
            {
                throw new ArgumentNullException("tokenizedText");
            }

            Dictionary <IToken, int> groupedTokens = tokenizedText.GroupBy(t => t)
                                                     .ToDictionary(g => g.Key, g => g.Count());
            var result = new Dictionary <IToken, TfIdf>();

            // Max() throws on an empty sequence; a text without tokens simply has no scores.
            if (groupedTokens.Count == 0)
            {
                return(result);
            }

            int maxFrequency = groupedTokens.Values.Max();

            foreach (var groupedToken in groupedTokens)
            {
                // 0.5 * int is evaluated in double, so the division is not an integer division.
                double tf         = 0.5 + 0.5 * groupedToken.Value / maxFrequency;
                double df         = _dfProvider.GeDocumentsWithTokenCount(groupedToken.Key);
                // The +1 on both sides keeps the ratio defined when df is 0.
                double idf        = Math.Log((_dfProvider.CorpusSize + 1.0) / (df + 1.0));
                double tfidfValue = tf * idf;
                var    tfidf      = new TfIdf(tfidfValue, groupedToken.Key);
                result.Add(tfidf.Token, tfidf);
            }
            return(result);
        }