/// <summary>
/// Entry point for the information-retrieval coursework driver: runs the
/// TF-IDF based search over Wikipedia pages previously crawled and indexed
/// into <c>folderPath</c>.
/// </summary>
/// <remarks>
/// Earlier pipeline stages (task 1 crawl, task 2 stemming, task 3 inverted
/// index, task 4 TF/IDF computation) were run once to populate the folder;
/// their invocations were removed as dead commented-out code. Re-run those
/// stages from source control history if the index must be rebuilt.
/// </remarks>
static void Main(string[] args)
{
    string link = "https://ru.wikipedia.org/";
    string folderPath = @"D:\IS\";

    // Task 4: TF-IDF model over the crawled documents (constructor is kept
    // because it may load precomputed data — NOTE(review): confirm whether
    // the Search below actually depends on this instance existing).
    var tfIdf = new TfIdf(link, folderPath);

    // Task 5: vector-space search over the TF-IDF index.
    var searchResult = new Search(link, folderPath);
    searchResult.SearchWord("патрулирован свободный википедия");
}
/// <summary>
/// Evaluates the trained TF-IDF classifier on the ASR test set: scores each
/// document against every category, picks the highest-scoring category as the
/// prediction, and prints a running accuracy after each document.
/// </summary>
private static void CategoryPartOnTrainSet()
{
    TfIdf.LoadDb();
    var testSet = DataLoad.AsrTestSet(DataLoad.FetchAsrDocuments);

    // Hoisted: Enum.GetValues allocates a fresh array on every call; the
    // original re-evaluated it inside the per-word inner loop.
    var categories = Enum.GetValues(typeof(AsrIranCategories))
        .Cast<AsrIranCategories>()
        .ToArray();

    float correct = 0;
    float incorrect = 0;
    foreach (var document in testSet)
    {
        var (wordList, cat) = document.Item;

        // One score accumulator per category, starting at zero.
        var scores = categories.ToDictionary(c => c, _ => 0d);

        Console.WriteLine("Calculating...");
        foreach (var word in wordList)
        {
            foreach (var category in categories)
            {
                scores[category] += TfIdf.CalculateTfIdf(word, category);
            }
        }

        Console.WriteLine();
        Console.WriteLine();
        Console.ForegroundColor = ConsoleColor.Blue;
        Console.WriteLine("Correct Class: " + cat);

        var orderedScore = scores.OrderByDescending(x => x.Value);
        if (orderedScore.First().Key != cat)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            incorrect++;
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Green;
            correct++;
        }

        // Print the five best-scoring categories in the verdict color, then
        // restore the console. Fix: the original called ResetColor() inside
        // the loop, so only the first of the five lines was colored.
        foreach (var item in orderedScore.Take(5))
        {
            Console.WriteLine($"{item.Key} = {item.Value}");
        }
        Console.ResetColor();

        Console.WriteLine();
        Console.WriteLine();
        // Fix: output typo "ACURACY" -> "ACCURACY".
        Console.WriteLine("===== ACCURACY : " + correct * 100 / (correct + incorrect) + "======");
    }
}
/// <summary>
/// Aggregates MeCab word counts from the *.json files found in the given
/// directories, computes per-file TF-IDF, and writes both the raw and the
/// normalized results as indented JSON under "計算結果/" and "計算結果_正規化/".
/// </summary>
/// <param name="args">One or more directories to scan (non-recursively).</param>
static void Main(string[] args)
{
    var aggregate = new Dictionary<string, IEnumerable<MeCabResultAggregate>>();
    if (args.Length == 0)
    {
        Console.Error.WriteLine("ディレクトリを指定してください");
        return;
    }

    foreach (string dir in args)
    {
        foreach (string file in Directory.EnumerateFiles(dir, "*.json", SearchOption.TopDirectoryOnly))
        {
            using (StreamReader reader = new StreamReader(file))
            {
                aggregate[file] = JsonConvert.DeserializeObject<IEnumerable<MeCabResultAggregate>>(reader.ReadToEnd());
            }
            // "{0}内の単語数" = word count inside this file.
            Console.WriteLine("{0}内の単語数 : {1}", file, aggregate[file].Count());
        }
    }

    Dictionary<string, Dictionary<string, double>> tfIdf = TfIdf.GetAllTfIdf(aggregate);
    // "対象のファイル数" = number of files processed.
    Console.WriteLine("対象のファイル数 : {0}", aggregate.Count);

    // The raw and normalized outputs used two byte-identical write loops;
    // extracted into one local helper.
    WriteResults("計算結果/", tfIdf);
    WriteResults("計算結果_正規化/", TfIdf.Normalize(tfIdf));

    // Writes one indented-JSON file per source file under the given prefix,
    // creating the target directory on demand.
    void WriteResults(string prefix, Dictionary<string, Dictionary<string, double>> results)
    {
        foreach (KeyValuePair<string, Dictionary<string, double>> entry in results)
        {
            string dest = prefix + entry.Key;
            string destDir = Path.GetDirectoryName(dest);
            if (!Directory.Exists(destDir))
            {
                Directory.CreateDirectory(destDir);
            }
            using (StreamWriter writer = new StreamWriter(dest))
            {
                writer.Write(JsonConvert.SerializeObject(entry.Value, Formatting.Indented));
            }
        }
    }
}
/// <summary>
/// Loads the pre-tokenized (4-gram) training document set from the Android
/// app's Assets so that document-similarity predictions skip tokenization at
/// runtime, cutting per-prediction compute time.
/// </summary>
public PredictionEngine()
{
    // TF-IDF model used for document-similarity analysis.
    tfidf = new TfIdf();

    // euc-kr is a code-pages encoding: the provider must be registered
    // before Encoding.GetEncoding can resolve it on .NET Core / Xamarin.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // Fix: the asset stream was opened but never disposed (resource leak);
    // `using` guarantees it is closed once loading finishes.
    // NOTE(review): assumes Load_documents consumes the stream eagerly —
    // confirm it does not keep the stream for lazy reads.
    using (Stream assets = Android.App.Application.Context.Assets.Open("msg_non_ratio_4ngram_trainset.csv"))
    {
        tfidf.Load_documents(assets, Encoding.GetEncoding("euc-kr"));
    }
}
/// <summary>
/// Prompts for an n-gram size, extracts n-grams from all data files in
/// parallel, saves the raw n-grams, then sorts/deduplicates them by TF-IDF
/// and saves the ranked result.
/// </summary>
static void Main(string[] args)
{
    Console.Write("Enter ngram size wanted : ");
    string value = Console.ReadLine();
    Console.Write(Environment.NewLine);

    // Re-prompt until the user enters a valid integer.
    int n = 0;
    while (!int.TryParse(value, out n))
    {
        Console.Write("Error bad integer value try again : ");
        value = Console.ReadLine();
        Console.Write(Environment.NewLine);
    }

    try
    {
        // Get texts content.
        IEnumerable<string> texts = FileManager.GetFilesContent(Defaults.dataFilesPath);

        // Extract n-grams from every text in parallel.
        List<string> nGramsResult = new List<string>();
        object gate = new object();
        Ngrams ng = new Ngrams();
        var options = new ParallelOptions { MaxDegreeOfParallelism = 20 };
        Parallel.ForEach(texts, options, text =>
        {
            // NOTE(review): assumes Ngrams.GetNgramsWords is safe to call
            // concurrently on one shared instance — confirm.
            var temp = ng.GetNgramsWords(text, n);
            // Fix: List<T> is not thread-safe; the original's unsynchronized
            // AddRange could corrupt the list or silently drop items when
            // several bodies run at once.
            lock (gate)
            {
                nGramsResult.AddRange(temp);
            }
        });

        // Save n-grams to a file.
        FileManager.SaveFile(Defaults.resultNGramFilePath, nGramsResult);

        // Sort n-grams (removing duplicates at the same time) by TF-IDF.
        var tfIdfResult = TfIdf.SortNGrams(nGramsResult);

        // Save the sorted TF-IDF result to a file.
        FileManager.SaveFile(Defaults.resultTfIdfFilePath, tfIdfResult);
    }
    catch (Exception e)
    {
        Console.WriteLine(e.ToString());
    }
    Console.ReadKey();
}
/// <summary>
/// Verifies that TermWeight multiplies TF by IDF, and that a term with no
/// configured IDF value (Moq returns the default 0) yields a weight of 0
/// instead of failing.
/// </summary>
public void TfIdfMeasureIsCorrectEvenIfWordsDoesNotExistInIdf()
{
    // Arrange: IDF values exist for only two of the three terms.
    var idfMock = new Mock<IIdf>();
    idfMock.Setup(i => i.Value("test")).Returns(0.23);
    idfMock.Setup(i => i.Value("two")).Returns(1d);

    var tfMock = new Mock<ITermWeightRepresentation>();
    tfMock.Setup(t => t.TermWeight("test")).Returns(0.78);
    tfMock.Setup(t => t.TermWeight("two")).Returns(0.5);
    tfMock.Setup(t => t.TermWeight("third")).Returns(0.5);

    var sut = new TfIdf(tfMock.Object, idfMock.Object);

    // Assert: 0.78 * 0.23, 0.5 * 1.0, and 0.5 * 0 for the unconfigured term.
    Assert.AreEqual(0.1794, sut.TermWeight("test"), 0.0001);
    Assert.AreEqual(0.5, sut.TermWeight("two"), 0.0001);
    Assert.AreEqual(0d, sut.TermWeight("third"), 0.0001);
}
/// <summary>
/// Interactive classification: reads a file path from the console, echoes and
/// cleanses the file's text, accumulates a TF-IDF score per category over all
/// tokens, and prints every category ordered by descending score.
/// </summary>
private static void GetUserInputCategory()
{
    TfIdf.LoadDb();

    Console.Write("Enter file path: ");
    var path = Console.ReadLine();
    Console.WriteLine();

    var text = File.ReadAllText(path);
    Console.WriteLine(text);
    var tokens = DataCleanser.CleanseUserInput(text);

    // One zero-initialized score bucket per category.
    var categories = Enum.GetValues(typeof(AsrIranCategories))
        .Cast<AsrIranCategories>()
        .ToArray();
    var scores = categories.ToDictionary(c => c, _ => 0d);

    Console.WriteLine("Calculating...");
    foreach (var token in tokens)
    {
        foreach (var category in categories)
        {
            scores[category] += TfIdf.CalculateTfIdf(token, category);
        }
    }

    Console.WriteLine();
    Console.WriteLine();
    Console.ForegroundColor = ConsoleColor.Blue;
    foreach (var pair in scores.OrderByDescending(p => p.Value))
    {
        Console.WriteLine($"{pair.Key} = {pair.Value}");
    }
    Console.ResetColor();
    Console.WriteLine();
    Console.WriteLine();
}
/// <summary>
/// Computes an augmented-TF / smoothed-IDF weight for every distinct token in
/// the given tokenized text.
/// </summary>
/// <param name="tokenizedText">Tokens of a single document; may contain duplicates.</param>
/// <returns>
/// A map from each distinct token to its <see cref="TfIdf"/> weight. Empty
/// input yields an empty map.
/// </returns>
/// <exception cref="ArgumentNullException">When <paramref name="tokenizedText"/> is null.</exception>
public Dictionary<IToken, TfIdf> Calculate(IEnumerable<IToken> tokenizedText)
{
    if (tokenizedText == null)
    {
        // nameof keeps the parameter name refactor-safe.
        throw new ArgumentNullException(nameof(tokenizedText));
    }

    Dictionary<IToken, int> groupedTokens = tokenizedText
        .GroupBy(t => t)
        .ToDictionary(g => g.Key, g => g.Count());

    var result = new Dictionary<IToken, TfIdf>();

    // Fix: the original called Max() on the (possibly empty) value collection,
    // which throws InvalidOperationException for empty input; an empty
    // document now simply produces an empty result.
    if (groupedTokens.Count == 0)
    {
        return result;
    }

    int maxFrequency = groupedTokens.Values.Max();

    foreach (var groupedToken in groupedTokens)
    {
        // Augmented term frequency: 0.5 + 0.5 * f / maxF dampens the bias
        // toward longer documents.
        double tf = 0.5 + 0.5 * groupedToken.Value / maxFrequency;
        double df = _dfProvider.GeDocumentsWithTokenCount(groupedToken.Key);
        // +1 smoothing on both corpus size and document frequency avoids a
        // zero denominator for unseen tokens.
        double idf = Math.Log((_dfProvider.CorpusSize + 1.0) / (df + 1.0));
        var tfidf = new TfIdf(tf * idf, groupedToken.Key);
        result.Add(tfidf.Token, tfidf);
    }

    return result;
}