Exemple #1
0
        /// <summary>
        /// Extracts the top-ranked words of <paramref name="text"/> as keywords.
        /// </summary>
        /// <param name="text">Raw text that is wrapped in a <c>TajikDocument</c>.</param>
        /// <param name="countOfKeywords">Maximum number of keywords to return.</param>
        /// <param name="category">Category forwarded to the TF-IDF calculation.</param>
        /// <returns>
        /// At most <paramref name="countOfKeywords"/> words, ordered by TF-IDF descending,
        /// with ties broken by TF and then by IDF (both descending).
        /// </returns>
        public List<string> GetKeywords(string text, int countOfKeywords, IDFCategory category)
        {
            var parsed = new Document.TajikDocument(text);

            // Normalize every sentence's words before any scoring takes place.
            foreach (var sentence in parsed.Sentences)
            {
                sentence.NormalizeWords();
            }

            var scored = KEAGlobal.TFIDFManager.CalculateTFIDFWithIDF(parsed, category);
            var ranked = scored
                .OrderByDescending(entry => entry.TF_IDF)
                .ThenByDescending(entry => entry.TF)
                .ThenByDescending(entry => entry.IDF);

            return ranked
                .Take(countOfKeywords)
                .Select(entry => entry.Word)
                .ToList();
        }
Exemple #2
0
        /// <summary>
        /// Computes the IDF of one word against a category's documents and appends the
        /// result to the word's category links.
        /// </summary>
        /// <param name="category">Category the computed IDF is recorded under.</param>
        /// <param name="wordsData">Word entry whose <c>Content</c> is scored and whose links are extended.</param>
        /// <param name="documents">Documents the IDF is calculated over.</param>
        static void CalculateCategory(IDFCategory category, IWordDataSet wordsData, List<TajikDocument> documents)
        {
            var categoryIdf = KEAGlobal.TFIDFManager.CalCulateIDF(documents, new TajikWord(wordsData.Content));

            // The link collection is created on first use.
            if (wordsData.IDFCategoryLinks == null)
            {
                wordsData.IDFCategoryLinks = new List<IDFCategoryLink>();
            }

            var link = new IDFCategoryLink
            {
                Category = category,
                IDF      = categoryIdf
            };
            wordsData.IDFCategoryLinks.Add(link);
        }
Exemple #3
0
        /// <summary>
        /// Computes, for every word in the context, one IDF per document category plus a
        /// corpus-wide IDF over all categories together, then serializes the words to
        /// <c>WordAllIDF.json</c> (relative path) and waits for a line on standard input.
        /// </summary>
        /// <param name="args">
        /// Optional. When <c>args[0]</c> is present and not blank it is used as the folder that
        /// contains one sub-folder per category; otherwise the original hard-coded folder is used.
        /// </param>
        static void Main(string[] args)
        {
            // Substituted for any IDF that comes out as exactly 0, so no stored IDF is zero.
            const double minimum = 0.00000000000000000000000112;

            var corpusRoot = (args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0]))
                ? args[0]
                : @"C:\Users\dilshodk\Desktop\for me";

            // Display name of each category and the sub-folder holding its documents.
            // Folder spellings intentionally differ from the names; they must match the disk layout.
            var categorySources = new[]
            {
                new { Name = "Бадеӣ",      Folder = "Бадеи" },
                new { Name = "Гуманитарӣ", Folder = "Гуманитари" },
                new { Name = "Иқтисодӣ",   Folder = "Иқтисодӣ" },
                new { Name = "Илми дақиқ", Folder = "Илми дакик" },
                new { Name = "Сиёсӣ",      Folder = "Сиёси" },
                new { Name = "Техникӣ",    Folder = "Техники" },
                new { Name = "Тиб",        Folder = "Тиб" },
                new { Name = "Физика",     Folder = "Физика" },
                new { Name = "Химия",      Folder = "Химия" }
            };

            TajikKEAContext jsonContext = new TajikKEAContext();

            KEAGlobal.InitiateKEAGlobal(jsonContext);
            PDFHelper pDFHelper = new PDFHelper();

            // Load each category's documents once; keep them per category and as one combined list.
            var corpora      = new List<KeyValuePair<IDFCategory, List<TajikDocument>>>();
            var allDocuments = new List<TajikDocument>();

            foreach (var source in categorySources)
            {
                var category = new IDFCategory()
                {
                    Guid = Guid.NewGuid(), Name = source.Name
                };
                List<TajikDocument> documents = GetDocuments(pDFHelper, System.IO.Path.Combine(corpusRoot, source.Folder));

                corpora.Add(new KeyValuePair<IDFCategory, List<TajikDocument>>(category, documents));
                allDocuments.AddRange(documents);
            }

            foreach (var item in jsonContext.Words)
            {
                foreach (var corpus in corpora)
                {
                    CalculateCategory(corpus.Key, item, corpus.Value);
                }

                // Clamp zero IDFs on every link of this word (CalculateCategory guarantees the
                // collection exists), including any links that were present before this run.
                foreach (var link in item.IDFCategoryLinks)
                {
                    if (link.IDF == 0)
                    {
                        link.IDF = minimum;
                    }
                }

                var idf = KEAGlobal.TFIDFManager.CalCulateIDF(allDocuments, new TajikWord(item.Content));
                if (idf == 0)
                {
                    idf = minimum;
                }
                item.CommonIDF = idf;
            }

            var text = JsonConvert.SerializeObject(jsonContext.Words, Formatting.Indented);

            File.WriteAllText("WordAllIDF.json", text);
            Console.ReadLine();
        }
        /// <summary>
        /// Produces one TF-IDF entry for each distinct word value in the document.
        /// </summary>
        /// <param name="documentToCalculate">Document whose words are scored.</param>
        /// <param name="category">
        /// Optional category. When given, a word's stored IDF for that category is used if it
        /// exists and is greater than zero; otherwise the word's common IDF is used.
        /// </param>
        /// <returns>
        /// A list of <c>TFIDFView</c> items, one per distinct word value. Words absent from
        /// <c>KEAGlobal.Context.Words</c> get an IDF calculated over this document alone.
        /// </returns>
        public List<TFIDFView> CalculateTFIDFWithIDF(Document.TajikDocument documentToCalculate, IDFCategory category = null)
        {
            var results = new List<TFIDFView>();

            // First occurrence of each distinct word value, in document order.
            var distinctWords = documentToCalculate.Sentences
                .SelectMany(sentence => sentence.Words)
                .ToList()
                .GroupBy(word => word.Value)
                .Select(group => group.FirstOrDefault());

            foreach (var current in distinctWords)
            {
                var tf    = CalCulateTF(current, documentToCalculate);
                var known = KEAGlobal.Context.Words.FirstOrDefault(entry => entry.Content == current.Value);
                double idf;

                if (known == null)
                {
                    // Unknown word: fall back to an IDF over a data set holding only this document.
                    var selfOnly = new List<Document.TajikDocument> { documentToCalculate };
                    idf = CalCulateIDF(selfOnly, current);
                }
                else if (category == null)
                {
                    idf = known.CommonIDF;
                }
                else
                {
                    var link = known.IDFCategoryLinks?.FirstOrDefault(candidate => candidate.Category?.Guid == category.Guid);
                    var hasUsableCategoryIdf = link != null && link.IDF > 0;

                    idf = hasUsableCategoryIdf ? link.IDF : known.CommonIDF;
                }

                results.Add(CalculateTFIDF(current.Value, idf, tf));
            }

            return results;
        }