/// <summary>
/// Stop words that are dropped from a query before scoring.
/// </summary>
private static readonly HashSet<string> SearchQueryStopWords = new HashSet<string>
{
    "and", "an", "by", "from", "of", "the", "with", "a", "in"
};

/// <summary>
/// Regex-based replace. <c>string.Replace</c> only matches literal text, so every
/// pattern that relies on regex syntax (anchors, classes, quantifiers) goes through here.
/// The type is fully qualified so the method does not depend on an extra using directive.
/// </summary>
private static string SearchRegexReplace(string input, string pattern, string replacement)
{
    return System.Text.RegularExpressions.Regex.Replace(input, pattern, replacement);
}

/// <summary>
/// Lower-cases a raw query word, strips surrounding brackets/quotes and commas,
/// and applies the plural-suffix rewrites (ies -> y, es -> e, trailing s removed).
/// </summary>
private static string NormalizeSearchToken(string raw)
{
    string token = raw.ToLowerInvariant();
    token = SearchRegexReplace(token, "^\\[|\\]$", "");
    token = SearchRegexReplace(token, "^\\(|\\)$", "");
    token = token.Replace("'", "");
    token = SearchRegexReplace(token, "^\"|\"$", "");
    token = token.Replace(",", "");
    token = token.Trim();

    // The three suffix rules run one after another (not else-if), so a word
    // skipped by one rule can still be rewritten by a later one.
    if (token.EndsWith("ies", StringComparison.Ordinal)
        && !token.EndsWith("aies", StringComparison.Ordinal)
        && !token.EndsWith("eies", StringComparison.Ordinal))
    {
        token = token.Substring(0, token.Length - 3) + "y";
    }

    if (token.EndsWith("es", StringComparison.Ordinal)
        && !token.EndsWith("aes", StringComparison.Ordinal)
        && !token.EndsWith("ees", StringComparison.Ordinal)
        && !token.EndsWith("oes", StringComparison.Ordinal))
    {
        token = token.Substring(0, token.Length - 2) + "e";
    }

    if (token.EndsWith("s", StringComparison.Ordinal)
        && !token.EndsWith("us", StringComparison.Ordinal)
        && !token.EndsWith("ss", StringComparison.Ordinal))
    {
        token = token.Substring(0, token.Length - 1);
    }

    return token;
}

/// <summary>
/// Splits a raw query into normalized search terms: markup-like tags and
/// sentence punctuation are replaced by spaces, each word is normalized,
/// and stop words, empty strings and single-character terms are discarded.
/// </summary>
private static List<string> TokenizeSearchQuery(string query)
{
    query = SearchRegexReplace(query, "<.*?>", " ");
    query = query.Replace("-", " ");
    query = query.Replace(", ", " ");
    query = query.Replace("; ", " ");
    query = query.Replace("? ", " ");
    query = query.Replace(": ", " ");
    query = query.Replace("! ", " ");
    query = query.Replace(". ", " ");
    query = SearchRegexReplace(query, "\\.[\"']", " ");
    query = SearchRegexReplace(query, " +", " ");
    query = SearchRegexReplace(query, " [A-Za-z] ", " ");

    List<string> terms = new List<string>();
    foreach (string raw in query.Split(' '))
    {
        string token = NormalizeSearchToken(raw);
        if (token.Length <= 1 || SearchQueryStopWords.Contains(token))
        {
            continue;
        }

        // Drop whatever punctuation is still attached, then re-check the length.
        token = SearchRegexReplace(token, "[^A-Za-z0-9]", "");
        if (token.Length > 1)
        {
            terms.Add(token);
        }
    }

    return terms;
}

/// <summary>
/// Runs a free-text query against the loaded index and returns a printable
/// report of the (at most) five best-scoring documents.
/// </summary>
/// <param name="query_input">Raw query text; <c>null</c> is treated as an empty query.</param>
/// <returns>
/// For each hit: headline, document path, the computed score and the snippet,
/// in descending score order. "NO RESULTs" when no query term matched a posting.
/// </returns>
/// <remarks>
/// Reads the fields <c>dict</c>, <c>post</c>, <c>docs</c> and <c>collectionSize</c>,
/// and uses <c>scores_m</c> as scratch storage that is cleared before returning.
/// NOTE(review): assumes the terms stored in <c>dict</c> were produced with the same
/// normalization as <see cref="TokenizeSearchQuery"/> - confirm against the indexer.
/// NOTE(review): assumes document ids are the contiguous range 1..docs.Count,
/// because ids are used as array indexes below - confirm against the docs table.
/// </remarks>
public string start_search(string query_input)
{
    StringBuilder to_output = new StringBuilder();
    string newline = Environment.NewLine;

    List<string> query_tok = TokenizeSearchQuery(query_input ?? string.Empty);

    double[] scores = new double[docs.Count + 1];
    HashSet<int> matched_docs = new HashSet<int>();

    foreach (string tok in query_tok)
    {
        SearchDictionary entry;
        if (!dict.TryGetValue(tok, out entry))
        {
            continue;
        }

        // Term frequency per document id for this term (0 when the document lacks the term).
        int[] tf = new int[docs.Count + 1];
        for (int x = 0; x < entry.df; x++)
        {
            Posting p = post[entry.offset + x];
            tf[p.docid] = p.tf;
            matched_docs.Add(p.docid);
        }

        // Collection-level share of the mixture; identical for every document.
        double background = 0.1 * (double)entry.cf / (double)collectionSize;

        for (int y = 1; y < tf.Length; y++)
        {
            Docs_Info doc = docs[y];
            // Base-2 log of 0.9 * (tf / doclength) + 0.1 * (cf / collectionSize), summed over terms.
            scores[y] += Math.Log10(0.9 * ((double)tf[y] / (double)doc.doclength) + background) / Math.Log10(2);

            // Only documents that contain at least one query term are ranked.
            if (matched_docs.Contains(y))
            {
                scores_m[y] = scores[y];
            }
        }
    }

    // Enumerate the ordered sequence directly: copying it into a dictionary
    // would not guarantee that the ranking order survives enumeration.
    var top5 = scores_m.OrderByDescending(pair => pair.Value).Take(5).ToList();
    foreach (var hit in top5)
    {
        Docs_Info doc_r = docs[hit.Key];
        to_output.Append(doc_r.headline + newline + doc_r.docpath + Environment.NewLine + "Computed probability: " + hit.Value + newline);
        to_output.Append(doc_r.snippet + newline + newline);
    }

    if (scores_m.Count == 0)
    {
        to_output.Append("NO RESULTs" + newline);
    }

    scores_m.Clear();
    return to_output.ToString();
}
/// <summary>
/// Loads one of the index CSV files into the corresponding in-memory field.
/// The file kind is decided by comparing <paramref name="filename"/> with four
/// fixed relative paths; any other path is read but leaves every field untouched.
/// </summary>
/// <param name="filename">Path of the CSV file to read.</param>
/// <remarks>
/// The first line of every file is skipped. Rows whose column count does not
/// match the expected layout are ignored:
/// dictionary.csv - 4 columns, stored in <c>dict</c> keyed by column 0;
/// postings.csv - 2 columns, stored in <c>post</c> at the zero-based row index;
/// docsTable.csv - 5 columns, stored in <c>docs</c> keyed by the integer in column 0;
/// Total.csv - the whole line is parsed into <c>collectionSize</c>.
/// A missing file is reported on the console and otherwise ignored; number
/// parsing errors are not caught here and propagate to the caller.
/// </remarks>
public void IOFile_Process(String filename)
{
    bool isDict = filename.Equals("..\\..\\SEDocumentExtraction\\dictionary.csv");
    bool isPosting = filename.Equals("..\\..\\SEDocumentExtraction\\postings.csv");
    bool isDocsTable = filename.Equals("..\\..\\SEDocumentExtraction\\docsTable.csv");
    bool isTotal = filename.Equals("..\\..\\SEDocumentExtraction\\Total.csv");

    try
    {
        // The using block closes the reader even when a row fails to parse.
        using (StreamReader scanner = File.OpenText(filename))
        {
            int count = 0;
            string s;
            while ((s = scanner.ReadLine()) != null)
            {
                // count == 0 is the first line, which is never treated as data.
                if (count > 0)
                {
                    string[] separate = s.Split(',');
                    if (isDict && separate.Length == 4)
                    {
                        dict[separate[0]] = new SearchDictionary(
                            int.Parse(separate[1].Trim()),
                            int.Parse(separate[2].Trim()),
                            int.Parse(separate[3].Trim()));
                    }
                    else if (isPosting && separate.Length == 2)
                    {
                        post[count - 1] = new Posting(
                            int.Parse(separate[0].Trim()),
                            int.Parse(separate[1].Trim()));
                    }
                    else if (isDocsTable && separate.Length == 5)
                    {
                        docs[int.Parse(separate[0].Trim())] = new Docs_Info(
                            separate[1].Trim(),
                            int.Parse(separate[2].Trim()),
                            separate[3].Trim(),
                            separate[4].Trim());
                    }
                    else if (isTotal)
                    {
                        collectionSize = int.Parse(s.Trim());
                    }
                }

                count = count + 1;
            }
        }
    }
    catch (FileNotFoundException)
    {
        Console.WriteLine("File Not Found !!Please Enter Proper FileName !!");
        return;
    }
}