/// <summary>
/// Scores documents against a query.
/// </summary>
/// <remarks>
/// For every term in <c>query.terms</c> the rows returned by <c>GetDataByTerm</c> are read, and
/// each row contributes (query term value * row weight) to the running total of its document.
/// </remarks>
/// <param name="query">Query whose <c>terms</c> map each term to a numeric value.</param>
/// <returns>
/// A dictionary from document ID to accumulated score. Documents for which no row was returned
/// for any query term are absent.
/// </returns>
public static Dictionary<int, double> retrive(Query query)
{
    searchSetTableAdapters.Document_TermTableAdapter postingsAdapter =
        new searchSetTableAdapters.Document_TermTableAdapter();
    Dictionary<int, double> scoreByDocument = new Dictionary<int, double>();

    foreach (KeyValuePair<string, double> queryTerm in query.terms)
    {
        searchSet.Document_TermDataTable postings = postingsAdapter.GetDataByTerm(queryTerm.Key);

        foreach (searchSet.Document_TermRow posting in postings)
        {
            double contribution = queryTerm.Value * posting.Weight;
            int documentId = posting.Document_ID;

            double runningTotal;
            if (scoreByDocument.TryGetValue(documentId, out runningTotal))
            {
                scoreByDocument[documentId] = runningTotal + contribution;
            }
            else
            {
                scoreByDocument.Add(documentId, contribution);
            }
        }
    }

    return scoreByDocument;
}
/// <summary>
/// Rebuilds the index from a collection file and saves it to the database.
/// </summary>
/// <remarks>
/// <para>
/// The documents, terms and document-term tables are first emptied through their table adapters.
/// The file is then read line by line. Lines of one character or less are ignored. A line that
/// starts with '.' is a section marker chosen by its second character: 'I' starts a new document
/// (the token after the first space is parsed as its ID), 'T' starts the title, 'A' the authors
/// and 'W' the content; any other marker is ignored and leaves the current section unchanged.
/// Every other line is appended to the section that is currently open.
/// </para>
/// <para>
/// Each completed document is indexed into the in-memory data tables. After the file has been
/// read, IDF values are set when <c>config.idfOption == 1</c>, <c>calculateWeight()</c> is called,
/// the three tables are saved, and weights are normalised per document when
/// <c>MainForm.doc_config.normalizationOption == 1</c>.
/// </para>
/// </remarks>
/// <param name="pathName">Path of the collection file to index.</param>
public void openDocumentFile(string pathName)
{
    searchSetTableAdapters.DocumentsTableAdapter documentTableAdapter = new searchSetTableAdapters.DocumentsTableAdapter();
    searchSetTableAdapters.Document_TermTableAdapter documentTermTableAdapter = new searchSetTableAdapters.Document_TermTableAdapter();
    searchSetTableAdapters.TermsTableAdapter termTableAdapter = new searchSetTableAdapters.TermsTableAdapter();

    documentTableAdapter.DeleteAll();
    documentTermTableAdapter.DeleteAll();
    termTableAdapter.DeleteAll();

    char nowState = ' ';
    // True once an ".I" marker has been seen; prevents indexing an empty placeholder
    // document before the first real one (or for a file with no documents at all).
    bool documentStarted = false;
    int number = 0;
    int jumlah_dokumen = 0;
    string authors = "";
    StringBuilder titleBuilder = new StringBuilder("");
    StringBuilder contentBuilder = new StringBuilder("");

    // 'using' guarantees the reader is closed even when parsing or indexing throws.
    using (System.IO.StreamReader file = new System.IO.StreamReader(pathName))
    {
        string line;
        while ((line = file.ReadLine()) != null)
        {
            if (line.Length <= 1)
            {
                continue;
            }

            if (line[0] != '.')
            {
                // Plain text line: belongs to whichever section is open.
                if (nowState == 'T')
                {
                    titleBuilder.Append(line);
                    titleBuilder.Append(" ");
                }
                else if (nowState == 'A')
                {
                    if (authors.Length > 0)
                    {
                        authors += ", ";
                    }
                    authors += line.Trim();
                }
                else if (nowState == 'W')
                {
                    contentBuilder.Append(line);
                    contentBuilder.Append(" ");
                }
                continue;
            }

            switch (line[1])
            {
                case 'I':
                    // A new document begins: index the one collected so far, if any.
                    if (documentStarted)
                    {
                        addParsedDocumentToIndex(number, titleBuilder.ToString(), contentBuilder.ToString().Trim(), authors);
                        jumlah_dokumen++;
                        progressReporter.ReportProgress(jumlah_dokumen);
                    }

                    documentStarted = true;
                    nowState = 'I';
                    number = Int32.Parse(line.Split(' ')[1]);
                    authors = "";
                    contentBuilder = new StringBuilder("");
                    titleBuilder = new StringBuilder("");
                    break;

                case 'T':
                    nowState = 'T';
                    titleBuilder = new StringBuilder("");
                    break;

                case 'A':
                    nowState = 'A';
                    authors = "";
                    break;

                case 'W':
                    nowState = 'W';
                    contentBuilder = new StringBuilder("");
                    break;
            }
        }
    }

    // The last document has no following ".I" marker to trigger its indexing.
    if (documentStarted)
    {
        addParsedDocumentToIndex(number, titleBuilder.ToString(), contentBuilder.ToString().Trim(), authors);
        jumlah_dokumen++;
        progressReporter.ReportProgress(jumlah_dokumen);
    }

    if (config.idfOption == 1)
    {
        foreach (KeyValuePair<string, int> entry in termOccurenceInCollection)
        {
            searchSet.TermsRow termRow = termsDataTable.FindByTerm(entry.Key);
            if (termRow != null)
            {
                // NOTE(review): the numerator is the ID of the last document read, which only
                // equals the collection size when IDs run 1..N - confirm this is intended.
                termRow.IDF = (double)number / (double)entry.Value;
            }
        }
    }

    calculateWeight();

    // save to database
    documentTableAdapter.Update(documentDataTable);
    termTableAdapter.Update(termsDataTable);
    documentTermTableAdapter.Update(documentTermDataTable);

    if (MainForm.doc_config.normalizationOption == 1)
    {
        foreach (searchSet.DocumentsRow doc_row in documentDataTable)
        {
            double panjang = (double)documentTermTableAdapter.HitungPanjang(doc_row.ID);
            documentTermTableAdapter.NormalisasiWeight(panjang, doc_row.ID);
        }
    }
}

/// <summary>
/// Adds one parsed document to the in-memory documents, terms and document-term tables.
/// </summary>
/// <remarks>
/// Title, content and authors are joined, lower-cased and split into terms; blank terms and stop
/// words are removed and stemming is applied when <c>config.stemmingOption == 1</c>. Each distinct
/// term increases that term's entry in <c>termOccurenceInCollection</c> by one (once per document)
/// and gets a document-term row whose weight is <c>calculateTF(count, maxCount)</c>.
/// </remarks>
/// <param name="number">Document ID.</param>
/// <param name="title">Document title.</param>
/// <param name="content">Document content.</param>
/// <param name="authors">Comma-separated author list.</param>
private void addParsedDocumentToIndex(int number, string title, string content, string authors)
{
    StringBuilder allContentsInDocument = new StringBuilder("");
    allContentsInDocument.Append(title);
    allContentsInDocument.Append(" ");
    allContentsInDocument.Append(content);
    allContentsInDocument.Append(" ");
    allContentsInDocument.Append(authors);

    List<String> splittedTerms = Utility.separateData(allContentsInDocument.ToString().ToLower());
    splittedTerms = splittedTerms.Where(s => !string.IsNullOrWhiteSpace(s)).ToList();

    searchSet.DocumentsRow doc = documentDataTable.NewDocumentsRow();
    doc.ID = number;
    doc.Title = title;
    doc.Content = content;
    doc.Authors = authors;
    documentDataTable.AddDocumentsRow(doc);

    splittedTerms = Utility.removeStopWord(splittedTerms);
    if (config.stemmingOption == 1)
    {
        splittedTerms = Utility.stemming(splittedTerms);
    }

    int max_term = 0;
    Dictionary<string, int> termOccurenceInDocument = new Dictionary<string, int>();
    foreach (string s in splittedTerms)
    {
        int countInDocument;
        if (termOccurenceInDocument.TryGetValue(s, out countInDocument))
        {
            countInDocument++;
        }
        else
        {
            countInDocument = 1;

            // First occurrence in this document: count the document once for this term.
            if (termOccurenceInCollection.ContainsKey(s))
            {
                termOccurenceInCollection[s]++;
            }
            else
            {
                termOccurenceInCollection[s] = 1;
            }
        }

        termOccurenceInDocument[s] = countInDocument;
        if (max_term < countInDocument)
        {
            max_term = countInDocument;
        }
    }

    foreach (KeyValuePair<string, int> entry in termOccurenceInDocument)
    {
        // A term row is shared by all documents; only create it the first time the term is seen.
        if (termsDataTable.FindByTerm(entry.Key) == null)
        {
            searchSet.TermsRow term = termsDataTable.NewTermsRow();
            term.Term = entry.Key;
            term.IDF = 1;
            termsDataTable.AddTermsRow(term);
        }

        searchSet.Document_TermRow dterm = documentTermDataTable.NewDocument_TermRow();
        dterm.Term = entry.Key;
        dterm.Document_ID = number;
        dterm.Weight = calculateTF(entry.Value, max_term);
        documentTermDataTable.AddDocument_TermRow(dterm);
    }
}
/// <summary>
/// Adjusts the term values of <paramref name="query"/> using the judged documents in
/// <c>relevant_doc</c> and <c>irelevant_doc</c>, then prints the resulting terms to the console.
/// </summary>
/// <remarks>
/// <para>
/// When <c>MainForm.feedback_config.algorithm</c> is 0 each contribution is divided by the number
/// of documents in the corresponding list; otherwise the divisor is 1.
/// </para>
/// <para>
/// Term weights of relevant documents are added to matching query terms; terms not yet in the
/// query are added only when <c>MainForm.feedback_config.useexpand</c> is not 0. Term weights of
/// irrelevant documents are subtracted from matching query terms, and a term whose value drops to
/// zero or below is removed. When the algorithm setting is 2 only the first irrelevant document
/// is processed.
/// </para>
/// </remarks>
/// <param name="query">Query whose <c>terms</c> are modified in place.</param>
public void applyAlgorithm(Query query)
{
    bool divideByListSize = MainForm.feedback_config.algorithm == 0;
    double irrelevantDivisor = divideByListSize ? irelevant_doc.Count : 1;
    double relevantDivisor = divideByListSize ? relevant_doc.Count : 1;

    // Positive feedback: pull the query towards the relevant documents.
    foreach (int relevantId in relevant_doc)
    {
        searchSetTableAdapters.Document_TermTableAdapter adapter = new searchSetTableAdapters.Document_TermTableAdapter();
        searchSet.Document_TermDataTable termRows = adapter.GetDataByDocID(relevantId);

        foreach (searchSet.Document_TermRow termRow in termRows)
        {
            if (query.terms.ContainsKey(termRow.Term))
            {
                query.terms[termRow.Term] += termRow.Weight / relevantDivisor;
            }
            else if (MainForm.feedback_config.useexpand != 0)
            {
                // Expansion enabled: introduce the term into the query.
                query.terms.Add(termRow.Term, termRow.Weight / relevantDivisor);
            }
        }
    }

    // Negative feedback: push the query away from the irrelevant documents.
    foreach (int irrelevantId in irelevant_doc)
    {
        searchSetTableAdapters.Document_TermTableAdapter adapter = new searchSetTableAdapters.Document_TermTableAdapter();
        searchSet.Document_TermDataTable termRows = adapter.GetDataByDocID(irrelevantId);

        foreach (searchSet.Document_TermRow termRow in termRows)
        {
            if (!query.terms.ContainsKey(termRow.Term))
            {
                continue;
            }

            query.terms[termRow.Term] -= termRow.Weight / irrelevantDivisor;
            if (query.terms[termRow.Term] <= 0)
            {
                query.terms.Remove(termRow.Term);
            }
        }

        // With algorithm setting 2 only the first irrelevant document is used.
        if (MainForm.feedback_config.algorithm == 2)
        {
            break;
        }
    }

    foreach (KeyValuePair<string, double> queryTerm in query.terms)
    {
        string reportLine = queryTerm.Key + " --> " + queryTerm.Value;
        Console.WriteLine(reportLine);
    }
}