/// <summary>
/// Removes <paramref name="word"/> from <paramref name="sentence"/> when
/// <c>ShouldBeRemoved</c> reports true for the word's value.
/// </summary>
/// <param name="word">Word whose <c>Value</c> is passed to <c>ShouldBeRemoved</c>.</param>
/// <param name="sentence">Sentence whose <c>Words</c> collection the word is removed from.</param>
void RemoveUnnassesaryWords(TajikWord word, TajikSentence sentence)
{
    // Nothing to do for words the filter wants to keep.
    if (!ShouldBeRemoved(word.Value))
    {
        return;
    }

    sentence.Words.Remove(word);
}
/// <summary>
/// Replaces the word's value with its stem when the value ends with one of the
/// suffixes in <c>pasoyandJam</c> and the stem is present in <c>Context.Words</c>.
/// </summary>
/// <remarks>
/// The stem is produced by cutting off exactly the suffix that matched, rather than a
/// fixed two characters, so suffixes of any length are handled. Suffixes are tried in
/// list order; the first one whose stem is longer than one character and is found in
/// <c>Context.Words</c> is applied and the method returns. If none qualifies the word
/// is left untouched.
/// </remarks>
/// <param name="word">Word whose <c>Value</c> is inspected and possibly shortened.</param>
void ShakliJam(TajikWord word)
{
    foreach (var suffix in pasoyandJam)
    {
        if (!word.Value.EndsWith(suffix))
        {
            continue;
        }

        // Remove the matched suffix itself; its length is not necessarily 2.
        var stem = word.Value.Substring(0, word.Value.Length - suffix.Length);

        // Only accept stems of at least two characters that are known words.
        if (stem.Length > 1 && Context.Words.Any(s => s.Content == stem))
        {
            word.Value = stem;
            return;
        }
    }
}
/// <summary>
/// Computes the IDF of the word held by <paramref name="wordsData"/> over
/// <paramref name="documents"/> and appends it, paired with
/// <paramref name="category"/>, to the word's <c>IDFCategoryLinks</c>.
/// </summary>
/// <param name="category">Category stored on the new link.</param>
/// <param name="wordsData">Word entry; its <c>Content</c> is the term and its link list receives the result.</param>
/// <param name="documents">Documents passed to <c>KEAGlobal.TFIDFManager.CalCulateIDF</c>.</param>
static void CalculateCategory(IDFCategory category, IWordDataSet wordsData, List<TajikDocument> documents)
{
    var term = new TajikWord(wordsData.Content);
    var categoryIdf = KEAGlobal.TFIDFManager.CalCulateIDF(documents, term);

    // The link list is created on first use.
    if (wordsData.IDFCategoryLinks is null)
    {
        wordsData.IDFCategoryLinks = new List<IDFCategoryLink>();
    }

    var link = new IDFCategoryLink()
    {
        Category = category,
        IDF = categoryIdf
    };
    wordsData.IDFCategoryLinks.Add(link);
}
/// <summary>
/// For each suffix in <c>ishorakuni</c>, strips it from the word's value when the
/// remainder is longer than two characters and is present in <c>Context.Words</c>.
/// </summary>
/// <remarks>
/// Every suffix is tested against the word's current value, so a replacement made for
/// one suffix is what the following suffixes are compared with.
/// </remarks>
/// <param name="word">Word whose <c>Value</c> is inspected and possibly shortened.</param>
void Ishorakuni(TajikWord word)
{
    foreach (var suffix in ishorakuni)
    {
        if (!word.Value.EndsWith(suffix))
        {
            continue;
        }

        var stem = word.Value.Substring(0, word.Value.Length - suffix.Length);
        if (stem.Length <= 2)
        {
            continue;
        }

        // Accept the stem only when it is a known word.
        if (Context.Words.Any(entry => entry.Content == stem))
        {
            word.Value = stem;
        }
    }
}
/// <summary>
/// For each suffix in <c>bandakiU</c>, strips it from the word's value when the
/// remainder is longer than two characters and is present in <c>Context.Words</c>.
/// </summary>
/// <remarks>
/// Every suffix is tested against the word's current value, so a replacement made for
/// one suffix is what the following suffixes are compared with.
/// </remarks>
/// <param name="word">Word whose <c>Value</c> is inspected and possibly shortened.</param>
void BandakiU(TajikWord word)
{
    foreach (var ending in bandakiU)
    {
        if (!word.Value.EndsWith(ending))
        {
            continue;
        }

        var remainder = word.Value.Substring(0, word.Value.Length - ending.Length);
        var isKnownWord = remainder.Length > 2
            && Context.Words.Any(entry => entry.Content == remainder);

        if (isKnownWord)
        {
            word.Value = remainder;
        }
    }
}
/// <summary>
/// For each suffix in <c>bandakiI</c>, strips it from the word's value when the
/// remainder is longer than two characters and <c>DataSetContains</c> accepts it.
/// </summary>
/// <remarks>
/// When <c>DataSetContains</c> supplies a non-null out value, that value becomes the
/// word's new value; otherwise the stripped remainder is used. Every suffix is tested
/// against the word's current value.
/// </remarks>
/// <param name="word">Word whose <c>Value</c> is inspected and possibly replaced.</param>
void BandakiI(TajikWord word)
{
    foreach (var ending in bandakiI)
    {
        if (!word.Value.EndsWith(ending))
        {
            continue;
        }

        var remainder = word.Value.Substring(0, word.Value.Length - ending.Length);

        // Too-short remainders are rejected before the data set is consulted.
        if (remainder.Length <= 2)
        {
            continue;
        }

        if (!DataSetContains(remainder, out string replacement))
        {
            continue;
        }

        // Prefer the form returned by the data set lookup when there is one.
        word.Value = replacement ?? remainder;
    }
}
/// <summary>
/// Console entry point. For every word in the KEA context it computes one IDF per
/// document category plus an IDF over all documents together, then serializes the
/// words to <c>WordAllIDF.json</c> and waits for a line on standard input.
/// </summary>
/// <remarks>
/// Categories and their source folders are listed once in a table and processed in a
/// loop; the order of the table is the order in which documents are loaded and in
/// which category links are appended to each word.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Value written wherever an IDF comes out as exactly zero.
    // NOTE(review): presumably so consumers never see a zero weight - confirm.
    const double minimum = 0.00000000000000000000000112;

    TajikKEAContext jsonContext = new TajikKEAContext();
    KEAGlobal.InitiateKEAGlobal(jsonContext);
    PDFHelper pDFHelper = new PDFHelper();

    // Category display name -> folder the category's documents are read from.
    var sources = new[]
    {
        new { Name = "Бадеӣ", Folder = @"C:\Users\dilshodk\Desktop\for me\Бадеи" },
        new { Name = "Гуманитарӣ", Folder = @"C:\Users\dilshodk\Desktop\for me\Гуманитари" },
        new { Name = "Иқтисодӣ", Folder = @"C:\Users\dilshodk\Desktop\for me\Иқтисодӣ" },
        new { Name = "Илми дақиқ", Folder = @"C:\Users\dilshodk\Desktop\for me\Илми дакик" },
        new { Name = "Сиёсӣ", Folder = @"C:\Users\dilshodk\Desktop\for me\Сиёси" },
        new { Name = "Техникӣ", Folder = @"C:\Users\dilshodk\Desktop\for me\Техники" },
        new { Name = "Тиб", Folder = @"C:\Users\dilshodk\Desktop\for me\Тиб" },
        new { Name = "Физика", Folder = @"C:\Users\dilshodk\Desktop\for me\Физика" },
        new { Name = "Химия", Folder = @"C:\Users\dilshodk\Desktop\for me\Химия" }
    };

    // Each category paired with its own documents, plus the union of all documents.
    var categories = new List<KeyValuePair<IDFCategory, List<TajikDocument>>>();
    List<TajikDocument> allDocuments = new List<TajikDocument>();
    foreach (var source in sources)
    {
        var category = new IDFCategory() { Guid = Guid.NewGuid(), Name = source.Name };
        List<TajikDocument> documents = GetDocuments(pDFHelper, source.Folder);

        categories.Add(new KeyValuePair<IDFCategory, List<TajikDocument>>(category, documents));
        allDocuments.AddRange(documents);
    }

    foreach (var item in jsonContext.Words)
    {
        // Per-category IDF: appends one link per category to the word.
        foreach (var entry in categories)
        {
            CalculateCategory(entry.Key, item, entry.Value);
        }

        // IDF over the combined document set.
        var word = new TajikWord(item.Content);
        var idf = KEAGlobal.TFIDFManager.CalCulateIDF(allDocuments, word);
        if (idf == 0)
        {
            idf = minimum;
        }

        item.CommonIDF = idf;
    }

    // Second pass: replace zero category IDFs on every link of every word.
    foreach (var item in jsonContext.Words)
    {
        foreach (var link in item.IDFCategoryLinks)
        {
            if (link.IDF == 0)
            {
                link.IDF = minimum;
            }
        }
    }

    var text = JsonConvert.SerializeObject(jsonContext.Words, Formatting.Indented);
    File.WriteAllText("WordAllIDF.json", text);

    // Keeps the console window open until Enter is pressed.
    Console.ReadLine();
}
/// <summary>
/// Creates a TF instance for the given term and document.
/// </summary>
/// <param name="termin">Term stored in <c>Termin</c>.</param>
/// <param name="document">Document stored in <c>Document</c>.</param>
public TF(TajikWord termin, Document.TajikDocument document)
{
    this.Termin = termin;
    this.Document = document;
}
/// <summary>
/// Builds an <c>IDF</c> for the given documents and word and returns the result of
/// its <c>CalculateIDF</c> method.
/// </summary>
/// <param name="documentsDataSet">Documents handed to the <c>IDF</c> constructor.</param>
/// <param name="wordToCalculate">Word handed to the <c>IDF</c> constructor.</param>
/// <returns>The value produced by <c>IDF.CalculateIDF</c>.</returns>
public double CalCulateIDF(List<Document.TajikDocument> documentsDataSet, TajikWord wordToCalculate)
    => new IDF(documentsDataSet, wordToCalculate).CalculateIDF();
/// <summary>
/// Builds a <c>TF</c> for the given word and document and returns the result of its
/// <c>CalculateTF</c> method.
/// </summary>
/// <param name="wordToCalculate">Word handed to the <c>TF</c> constructor.</param>
/// <param name="documentToCalculate">Document handed to the <c>TF</c> constructor.</param>
/// <returns>The value produced by <c>TF.CalculateTF</c>.</returns>
public double CalCulateTF(TajikWord wordToCalculate, Document.TajikDocument documentToCalculate)
    => new TF(wordToCalculate, documentToCalculate).CalculateTF();
/// <summary>
/// Creates an IDF instance for the given documents and term.
/// </summary>
/// <param name="documents">Documents stored in <c>Documents</c>.</param>
/// <param name="termin">Term stored in <c>Termin</c>.</param>
public IDF(IEnumerable<Document.TajikDocument> documents, TajikWord termin)
{
    this.Documents = documents;
    this.Termin = termin;
}