/// <summary>
/// Computes a weight for each distinct term of a query.
/// The query is passed through stopword removal, number stripping and
/// (when <c>stemCode == 1</c>) stemming before it is split into terms.
/// </summary>
/// <param name="q">Raw query text.</param>
/// <param name="ListDocuments">Document collection handed to the <c>TermWeighting</c> constructor.</param>
/// <returns>
/// One <c>WeightedTermQuery</c> per distinct query term, in order of first occurrence.
/// </returns>
public static List<WeightedTermQuery> weightingQuery(string q, List<Document> ListDocuments)
{
    string queryString = StopwordTool.RemoveStopwords(q);

    // Remove numbers (runs of digits that are followed by a space).
    queryString = Regex.Replace(queryString, @"[0-9]+ ", string.Empty);

    if (stemCode == 1)
    {
        // Stemming: reduce each word to its base form.
        StemmingTool Stemmer = new StemmingTool();
        queryString = Stemmer.Stemming(queryString);
    }

    string[] qTerm = queryString.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

    TermWeighting QW = new TermWeighting(ListDocuments);
    List<WeightedTermQuery> ListQueryWithWeight = new List<WeightedTermQuery>();

    // Terms that have already been weighted. This set must live outside the
    // loop: re-creating it on every iteration would make the duplicate check
    // always pass and emit repeated query terms more than once.
    HashSet<string> found = new HashSet<string>();

    for (int i = 0; i < qTerm.Length; i++)
    {
        // HashSet.Add returns false when the term was seen before.
        if (!found.Add(qTerm[i]))
        {
            continue;
        }

        // Weight of this term within the query; i is the index of the
        // term's first occurrence in qTerm.
        double wTerm = QW.CalculateTermWeightingQuery(qTerm, i, tfQueryCode, idfQueryCode, normQueryCode);
        ListQueryWithWeight.Add(new WeightedTermQuery(qTerm[i], wTerm));
    }

    return ListQueryWithWeight;
}
/// <summary>
/// Rebuilds the static indexes from <c>ListDocuments</c> and writes the
/// weighted inverted file.
/// <list type="bullet">
/// <item><c>dDocuments</c>: term -> (document number -> occurrence count).</item>
/// <item><c>dTermWeigth</c>: term -> (document number -> term weight).</item>
/// </list>
/// The weights are then flattened to "term docNo weight" lines, sorted, and
/// written to the path held in <c>outputInvertedFile2</c>.
/// </summary>
/// <remarks>
/// Document numbers are used as dictionary keys via <c>Dictionary.Add</c>, so
/// two documents sharing the same <c>No</c> and a common term will throw
/// <see cref="ArgumentException"/>.
/// </remarks>
public static void createInvertedFileFromListDocuments()
{
    dDocuments = new Dictionary<string, Dictionary<string, int>>();

    // Start at index 0 so the first document is indexed too. The weighting
    // loop below already covers every document from 0; starting here at 1
    // left the first document with weights but without postings.
    for (int i = 0; i < ListDocuments.Count(); i++)
    {
        Document document = ListDocuments[i];

        // One group per distinct term, in order of first occurrence; the
        // group size is the number of occurrences of that term in the document.
        foreach (IGrouping<string, string> occurrences in document.Content.GroupBy(t => t))
        {
            Dictionary<string, int> postings;
            if (!dDocuments.TryGetValue(occurrences.Key, out postings))
            {
                postings = new Dictionary<string, int>();
                dDocuments.Add(occurrences.Key, postings);
            }

            postings.Add(document.No, occurrences.Count());
        }
    }

    TermWeighting TW = new TermWeighting(ListDocuments);
    dTermWeigth = new Dictionary<string, Dictionary<string, double>>();

    for (int i = 0; i < ListDocuments.Count(); i++)
    {
        Document document = ListDocuments[i];

        // Terms of this document that have already been weighted.
        HashSet<string> found = new HashSet<string>();

        for (int j = 0; j < document.Content.Count(); j++)
        {
            string term = document.Content[j];
            if (!found.Add(term))
            {
                continue; // only the first occurrence of a term is weighted
            }

            Dictionary<string, double> weights;
            if (!dTermWeigth.TryGetValue(term, out weights))
            {
                weights = new Dictionary<string, double>();
                dTermWeigth.Add(term, weights);
            }

            // Weight of the term at position j of document i.
            // NOTE(review): idfQueryCode is passed here alongside tfDocCode and
            // normDocCode — confirm that the query IDF setting is intended for documents.
            weights.Add(document.No, TW.CalculateTermWeightingDocument(i, j, tfDocCode, idfQueryCode, normDocCode));
        }
    }

    // Flatten to "term docNo weight" lines.
    List<string> ListTermWithWeight = new List<string>();
    foreach (KeyValuePair<string, Dictionary<string, double>> entry in dTermWeigth)
    {
        foreach (KeyValuePair<string, double> subEntry in entry.Value)
        {
            ListTermWithWeight.Add(entry.Key + " " + subEntry.Key + " " + subEntry.Value);
        }
    }

    ListTermWithWeight.Sort();

    using (StreamWriter writer = new StreamWriter(@outputInvertedFile2))
    {
        foreach (string linestring in ListTermWithWeight)
        {
            writer.WriteLine(linestring);
        }
    }
}