/// <summary>
/// Extracts the highest-ranked words of <paramref name="text"/> as keywords.
/// </summary>
/// <param name="text">Raw text used to build the document that is analysed.</param>
/// <param name="countOfKeywords">Maximum number of keywords to return.</param>
/// <param name="category">Category handed to the TF-IDF calculation.</param>
/// <returns>
/// Up to <paramref name="countOfKeywords"/> words, ordered by TF-IDF, then TF, then IDF
/// (all descending).
/// </returns>
public List<string> GetKeywords(string text, int countOfKeywords, IDFCategory category)
{
    var document = new Document.TajikDocument(text);

    // Words are normalized in place before any scoring happens.
    document.Sentences.ForEach(sentence => sentence.NormalizeWords());

    var rankedWords =
        from score in KEAGlobal.TFIDFManager.CalculateTFIDFWithIDF(document, category)
        orderby score.TF_IDF descending, score.TF descending, score.IDF descending
        select score.Word;

    return rankedWords.Take(countOfKeywords).ToList();
}
/// <summary>
/// Computes the IDF of one data-set word over the documents of a single category and
/// appends the result to the word's category links.
/// </summary>
/// <param name="category">Category the computed IDF is recorded for.</param>
/// <param name="wordsData">Data-set entry whose <c>Content</c> is scored and whose links are extended.</param>
/// <param name="documents">Documents the IDF is calculated over.</param>
static void CalculateCategory(IDFCategory category, IWordDataSet wordsData, List<TajikDocument> documents)
{
    var term = new TajikWord(wordsData.Content);
    var categoryIdf = KEAGlobal.TFIDFManager.CalCulateIDF(documents, term);

    // Create the link collection lazily the first time a word is processed.
    if (wordsData.IDFCategoryLinks == null)
    {
        wordsData.IDFCategoryLinks = new List<IDFCategoryLink>();
    }

    var link = new IDFCategoryLink();
    link.Category = category;
    link.IDF = categoryIdf;
    wordsData.IDFCategoryLinks.Add(link);
}
/// <summary>
/// Builds the IDF data set: for every word in the context it records one IDF per category
/// and one IDF over all documents, then writes the words to <c>WordAllIDF.json</c>.
/// </summary>
/// <param name="args">
/// Optional. <c>args[0]</c> overrides the root folder that contains one sub-folder of
/// documents per category. When absent or blank, the original hard-coded folder is used.
/// </param>
static void Main(string[] args)
{
    // Replacement value for IDF results that come out as exactly zero, so that no
    // stored IDF is zero.
    const double minimum = 0.00000000000000000000000112;

    // Default corpus location; kept identical to the previously hard-coded paths.
    string rootFolder = @"C:\Users\dilshodk\Desktop\for me";
    if (args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0]))
    {
        rootFolder = args[0];
    }

    TajikKEAContext jsonContext = new TajikKEAContext();
    KEAGlobal.InitiateKEAGlobal(jsonContext);
    PDFHelper pDFHelper = new PDFHelper();

    // Category display name -> sub-folder name. The folder names intentionally differ
    // from the display names in places (they are the names used on disk).
    var sources = new[]
    {
        new { Name = "Бадеӣ", Folder = "Бадеи" },
        new { Name = "Гуманитарӣ", Folder = "Гуманитари" },
        new { Name = "Иқтисодӣ", Folder = "Иқтисодӣ" },
        new { Name = "Илми дақиқ", Folder = "Илми дакик" },
        new { Name = "Сиёсӣ", Folder = "Сиёси" },
        new { Name = "Техникӣ", Folder = "Техники" },
        new { Name = "Тиб", Folder = "Тиб" },
        new { Name = "Физика", Folder = "Физика" },
        new { Name = "Химия", Folder = "Химия" },
    };

    // Load every category once, keeping the declaration order, and collect the
    // union of all documents for the category-independent IDF.
    var categories = new List<KeyValuePair<IDFCategory, List<TajikDocument>>>();
    List<TajikDocument> allDocuments = new List<TajikDocument>();
    foreach (var source in sources)
    {
        var category = new IDFCategory() { Guid = Guid.NewGuid(), Name = source.Name };
        var documents = GetDocuments(pDFHelper, System.IO.Path.Combine(rootFolder, source.Folder));
        categories.Add(new KeyValuePair<IDFCategory, List<TajikDocument>>(category, documents));
        allDocuments.AddRange(documents);
    }

    foreach (var item in jsonContext.Words)
    {
        // One IDF link per category, in table order.
        foreach (var entry in categories)
        {
            CalculateCategory(entry.Key, item, entry.Value);
        }

        // IDF over the whole corpus, independent of category.
        var word = new TajikWord(item.Content);
        var idf = KEAGlobal.TFIDFManager.CalCulateIDF(allDocuments, word);
        if (idf == 0)
        {
            idf = minimum;
        }

        item.CommonIDF = idf;
    }

    // Second pass: replace zero category IDFs with the minimum as well.
    foreach (var item in jsonContext.Words)
    {
        foreach (var link in item.IDFCategoryLinks)
        {
            if (link.IDF == 0)
            {
                link.IDF = minimum;
            }
        }
    }

    var text = JsonConvert.SerializeObject(jsonContext.Words, Formatting.Indented);
    File.WriteAllText("WordAllIDF.json", text);

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
/// <summary>
/// Calculates a TF-IDF entry for every distinct word value of a document.
/// </summary>
/// <remarks>
/// The IDF of a word is chosen as follows:
/// if the word exists in <c>KEAGlobal.Context.Words</c> and a category is supplied, the
/// IDF of the link whose category has the same <c>Guid</c> is used when it is positive;
/// otherwise the entry's <c>CommonIDF</c> is used. Words missing from the context get an
/// IDF calculated over a data set containing only <paramref name="documentToCalculate"/>.
/// </remarks>
/// <param name="documentToCalculate">Document whose words are scored.</param>
/// <param name="category">Optional category used to pick a category-specific IDF.</param>
/// <returns>One <see cref="TFIDFView"/> per distinct word value.</returns>
public List<TFIDFView> CalculateTFIDFWithIDF(Document.TajikDocument documentToCalculate, IDFCategory category = null)
{
    var scores = new List<TFIDFView>();

    // First occurrence of every distinct word value in the document.
    var distinctWords = documentToCalculate.Sentences
        .SelectMany(sentence => sentence.Words)
        .ToList()
        .GroupBy(word => word.Value)
        .Select(sameValue => sameValue.FirstOrDefault());

    foreach (var current in distinctWords)
    {
        var termFrequency = CalCulateTF(current, documentToCalculate);
        var known = KEAGlobal.Context.Words.FirstOrDefault(entry => entry.Content == current.Value);

        double inverseFrequency;
        if (known == null)
        {
            // Unknown word: the only available corpus is the document itself.
            var fallbackCorpus = new List<Document.TajikDocument> { documentToCalculate };
            inverseFrequency = CalCulateIDF(fallbackCorpus, current);
        }
        else
        {
            // A category link is only looked up when a category was requested.
            var link = category == null
                ? null
                : known.IDFCategoryLinks?.FirstOrDefault(candidate => candidate.Category?.Guid == category.Guid);

            inverseFrequency = (link != null && link.IDF > 0) ? link.IDF : known.CommonIDF;
        }

        scores.Add(CalculateTFIDF(current.Value, inverseFrequency, termFrequency));
    }

    return scores;
}