// Required at file scope: using System; using System.Collections.Generic;
// using System.IO; using System.Net; using System.Text.RegularExpressions;
// and using HtmlAgilityPack;

// Recursively walks the DOM, accumulating text into TopicBlocks. Headers
// (h1-h6) close the current block and open a new one at that level; nodes
// in Setting.ignoreTags or outside the detected main body contribute only
// a line break.
private static TopicBlocks ExtractBlocks(HtmlNode node, MainBodyDetector mbd,
                                         string[] titleTokens,
                                         TopicBlocks tbs = null, bool isRoot = true)
{
    if (node == null)
    {
        return null;
    }
    if (tbs == null)
    {
        tbs = new TopicBlocks(titleTokens);
    }

    if (Setting.ignoreTags.Contains(node.Name) || !mbd.isMainBody(node))
    {
        tbs.addExtractedText("\n");
    }
    else
    {
        // Anchored pattern matches exactly h1..h6 (the original combined an
        // unanchored "h[1-6]" with a length-2 check, which is equivalent).
        Regex headRegex = new Regex("^h[1-6]$");
        if (headRegex.IsMatch(node.Name))
        {
            // Close the block under the previous header...
            tbs.SaveBlock();
            // ...then open a new one at this header's level, keyed by the
            // stemmed, stopword-filtered header tokens.
            int hx = Convert.ToInt16(node.Name.Substring(1));
            string[] subtopicTokens = NLPmethods.Stemming(
                NLPmethods.FilterOutStopWords(
                    NLPmethods.Tokenization(WebUtility.HtmlDecode(ExtractText(node)))));
            tbs.addNewHeader(hx, subtopicTokens);
            return tbs; // the header's text is consumed here; skip its children
        }

        if (node.ChildNodes.Count == 0)
        {
            // Leaf node: keep its text, with line breaks flattened to spaces.
            tbs.addExtractedText(WebUtility.HtmlDecode(
                node.InnerText.Replace("\n", " ").Replace("\r", " ")));
        }
        else
        {
            foreach (HtmlNode n in node.ChildNodes)
            {
                // Links and block-level tags are set off by line breaks so the
                // later sentence splitter does not glue them to neighbors.
                if (n.Name.Equals("a"))
                {
                    tbs.addExtractedText("\n");
                }
                if (Setting.changeLineTags.Contains(n.Name))
                {
                    tbs.addExtractedText("\n");
                }
                ExtractBlocks(n, mbd, null, tbs, false);
                if (Setting.changeLineTags.Contains(n.Name))
                {
                    tbs.addExtractedText("\n");
                }
            }
        }
    }

    if (isRoot)
    {
        tbs.SaveBlock(); // flush the trailing block once the whole tree is done
    }
    return tbs;
}
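// For readers without the rest of the repo: TopicBlocks, MainBodyDetector,
// Setting, and NLPmethods are project classes not shown in this section. The
// sketch below is a hypothetical reconstruction of only the TopicBlocks
// surface that ExtractBlocks and Main actually call; the real class's
// signatures may differ.
public interface ITopicBlocksSketch
{
    // Append text (or a "\n" separator) to the block currently being built.
    void addExtractedText(string text);

    // Flush the current buffer as a finished block under the current header.
    void SaveBlock();

    // Start a new block at header level hx (1-6), keyed by its stemmed tokens.
    void addNewHeader(int hx, string[] subtopicTokens);

    // Score every saved block against the query tokens (used later in Main).
    Pair<string, double>[] getBlocksWithWeight(string[] queryTokens);
}

// Stand-in for the project's Pair type, assumed to be a simple 2-tuple;
// omit this if the repo already defines Pair elsewhere.
public class Pair<TFirst, TSecond>
{
    public TFirst First;
    public TSecond Second;
    public Pair(TFirst first, TSecond second) { First = first; Second = second; }
}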
// Pipeline: for each of the 50 queries, parse the crawled HTML pages into
// weighted topic blocks, split them into sentences, score the sentences
// (LDA, LexRank, Lucene), and write the top-ranked sentences to disk.
static void Main(string[] args)
{
    // Read the query list; the first 10 characters of each line (a
    // fixed-width prefix) are skipped before tokenizing the query text.
    List<string[]> queryTokenList = new List<string[]>();
    List<string> queryList = new List<string>();
    StreamReader sr = new StreamReader(Setting.queryListFile);
    while (!sr.EndOfStream)
    {
        string query = sr.ReadLine().Substring(10);
        string[] queryTokens = NLPmethods.Stemming(
            NLPmethods.FilterOutStopWords(NLPmethods.Tokenization(query)));
        queryTokenList.Add(queryTokens);
        queryList.Add(query);
    }
    sr.Close();

    if (Directory.Exists(Setting.HTML_DirectoryPath))
    {
        for (int qId = 1; qId <= 50; qId++)
        {
            string[] files = Directory.GetFiles(Setting.HTML_DirectoryPath,
                "MC-E-" + String.Format("{0:D4}", qId) + "-*.html");
            int HTMLcountInThisQuestion = files.Length;
            List<Sentence> Q_Sens = new List<Sentence>();
            List<string[]> All_Sens = new List<string[]>();
            int alreadyGetSentencesFromQid = 0, fileCount = 0;

            // Page indices may have gaps, so keep advancing i until enough
            // sentences are collected or every existing page has been read.
            for (int i = 1;
                 alreadyGetSentencesFromQid < Setting.numOfSentencesEachQ
                     && fileCount < HTMLcountInThisQuestion;
                 i++)
            {
                string file = Setting.HTML_DirectoryPath + "\\MC-E-"
                    + String.Format("{0:D4}", qId) + "-" + i + ".html";
                if (!File.Exists(file))
                {
                    continue;
                }
                fileCount++;
                Console.WriteLine(new FileInfo(file).Name);

                sr = new StreamReader(file);
                string html = sr.ReadToEnd();
                sr.Close();

                HtmlDocument doc = new HtmlDocument();
                // Treat <form> as a normal element so its children are parsed.
                HtmlNode.ElementsFlags.Remove("form");
                doc.LoadHtml(html);
                HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//body");
                HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//html//head//title");
                string[] titleTokens = null;
                if (titleNode != null)
                {
                    titleTokens = NLPmethods.Stemming(
                        NLPmethods.FilterOutStopWords(
                            NLPmethods.Tokenization(WebUtility.HtmlDecode(titleNode.InnerText))));
                }

                // Try the site-specific extractor first; fall back to the
                // generic main-body detector plus recursive block extraction.
                Pair<string, double>[] parseResult = QA.ExtractBlocks(doc);
                if (parseResult == null)
                {
                    MainBodyDetector mbd = new MainBodyDetector(bodyNode, Setting.thresholdT);
                    TopicBlocks tbs = ExtractBlocks(bodyNode, mbd, titleTokens);
                    parseResult = (tbs == null
                        ? null
                        : tbs.getBlocksWithWeight(queryTokenList[qId - 1].ToArray()));
                }

                Sentence[] sentences = SplitToSentences(parseResult, queryTokenList[qId - 1].ToArray(), i);
                string[] AllSentences = GetAllSentences(parseResult, queryTokenList[qId - 1].ToArray(), i);
                foreach (Sentence s in sentences)
                {
                    // Only the first OutputSentencesEachQ sentences are
                    // eligible for output; the rest still feed the models.
                    s.isTop = alreadyGetSentencesFromQid < Setting.OutputSentencesEachQ;
                    Q_Sens.Add(s);
                    alreadyGetSentencesFromQid++;
                }
                All_Sens.Add(AllSentences);
            }

            // LDA: train on all collected sentences, then score the candidates.
            LDA lda = new LDA(Setting.topicCount);
            lda.training(All_Sens);
            lda.testing(queryTokenList[qId - 1].ToArray(), Q_Sens.ToArray());

            // LexRank
            LexRank.getScore(Q_Sens.ToArray());

            // Lucene
            Lucene.indexing(Q_Sens.ToArray());
            Lucene.query(Q_Sens.ToArray(), queryList[qId - 1]);

            // Rank sentences by Lucene score, descending.
            Q_Sens.Sort(delegate(Sentence x, Sentence y)
            {
                //double a = x.lexRank * x.logRank * x.tf * x.topicWeight;
                //double b = y.lexRank * y.logRank * y.tf * y.topicWeight;
                double a = x.lucene;
                double b = y.lucene;
                return b.CompareTo(a);
            });

            // Write the top sentences (deduplicated) with all of their scores.
            if (!Directory.Exists(Setting.outputDirectoryPath))
            {
                Directory.CreateDirectory(Setting.outputDirectoryPath);
            }
            StreamWriter sw = new StreamWriter(Setting.outputDirectoryPath + @"\" + qId + ".txt");
            HashSet<string> alreadyOutput = new HashSet<string>();
            foreach (Sentence s in Q_Sens)
            {
                if (!alreadyOutput.Contains(s.sentnece) && s.isTop)
                {
                    sw.WriteLine("sentence:\t\t\t" + s.sentnece);
                    //sw.WriteLine("with chunker:\t\t" + s.senWithChunk);
                    //sw.WriteLine("parser:\n" + NLPmethods.Parser(s.sentnece));
                    sw.WriteLine("term freq:\t\t\t" + s.tf);
                    sw.WriteLine("search rank:\t\t" + s.searchRank);
                    sw.WriteLine("logRank:\t\t\t" + s.logRank);
                    sw.WriteLine("lexRank:\t\t\t" + s.lexRank);
                    sw.WriteLine("subtopic weight:\t" + s.topicWeight);
                    sw.WriteLine("lda:\t\t\t\t" + s.lda);
                    sw.WriteLine("lda2:\t\t\t\t" + s.lda2);
                    sw.WriteLine("lucene:\t\t\t\t" + s.lucene);
                    sw.WriteLine("not stopword count:\t" + s.stemTokens.Length);
                    sw.WriteLine("total token count:\t" + s.tokens.Length);
                    //sw.WriteLine("total:\t\t\t\t" + (s.lexRank * s.logRank * s.tf * s.topicWeight));
                    sw.WriteLine("----------------------------------------------------------------");
                    sw.Flush();
                    alreadyOutput.Add(s.sentnece);
                }
            }
            sw.Close();
        }
    }
    Console.WriteLine("Finish!");
    Console.ReadKey();
}
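// A standalone sketch of Main's final ranking-and-output step, using a
// trimmed-down, hypothetical Sentence stand-in (the project's Sentence class
// carries many more fields). It shows the same descending sort by Lucene
// score followed by first-occurrence deduplication via a HashSet.
static class RankingSketch
{
    class Sentence
    {
        public string Text;
        public double Lucene;
    }

    static void Demo()
    {
        var sens = new List<Sentence>
        {
            new Sentence { Text = "alpha", Lucene = 0.3 },
            new Sentence { Text = "beta",  Lucene = 0.9 },
            new Sentence { Text = "alpha", Lucene = 0.7 }, // duplicate text
        };

        // Descending sort by Lucene score, matching Q_Sens.Sort above.
        sens.Sort((x, y) => y.Lucene.CompareTo(x.Lucene));

        // Emit each distinct sentence once, mirroring the alreadyOutput set;
        // HashSet.Add returns false for an already-seen string.
        var alreadyOutput = new HashSet<string>();
        foreach (var s in sens)
        {
            if (alreadyOutput.Add(s.Text))
            {
                Console.WriteLine(s.Text + "\t" + s.Lucene);
            }
        }
        // Prints: beta 0.9, then alpha 0.7 (the 0.3 duplicate is skipped).
    }
}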