// Required at file scope: using System; using System.Collections.Generic;
// using System.IO; using System.Net; using System.Text.RegularExpressions;
// and using HtmlAgilityPack;

// Recursively walks the DOM, accumulating text into TopicBlocks. Headers
// (h1-h6) close the current block and open a new one at that level; nodes
// in Setting.ignoreTags or outside the detected main body contribute only
// a line break.
private static TopicBlocks ExtractBlocks(HtmlNode node, MainBodyDetector mbd,
                                         string[] titleTokens,
                                         TopicBlocks tbs = null, bool isRoot = true)
{
    if (node == null)
    {
        return null;
    }
    if (tbs == null)
    {
        tbs = new TopicBlocks(titleTokens);
    }

    if (Setting.ignoreTags.Contains(node.Name) || !mbd.isMainBody(node))
    {
        tbs.addExtractedText("\n");
    }
    else
    {
        // Anchored pattern matches exactly h1..h6 (the original combined an
        // unanchored "h[1-6]" with a length-2 check, which is equivalent).
        Regex headRegex = new Regex("^h[1-6]$");
        if (headRegex.IsMatch(node.Name))
        {
            // Close the block under the previous header...
            tbs.SaveBlock();
            // ...then open a new one at this header's level, keyed by the
            // stemmed, stopword-filtered header tokens.
            int hx = Convert.ToInt16(node.Name.Substring(1));
            string[] subtopicTokens = NLPmethods.Stemming(
                NLPmethods.FilterOutStopWords(
                    NLPmethods.Tokenization(WebUtility.HtmlDecode(ExtractText(node)))));
            tbs.addNewHeader(hx, subtopicTokens);
            return tbs; // the header's text is consumed here; skip its children
        }

        if (node.ChildNodes.Count == 0)
        {
            // Leaf node: keep its text, with line breaks flattened to spaces.
            tbs.addExtractedText(WebUtility.HtmlDecode(
                node.InnerText.Replace("\n", " ").Replace("\r", " ")));
        }
        else
        {
            foreach (HtmlNode n in node.ChildNodes)
            {
                // Links and block-level tags are set off by line breaks so the
                // later sentence splitter does not glue them to neighbors.
                if (n.Name.Equals("a"))
                {
                    tbs.addExtractedText("\n");
                }
                if (Setting.changeLineTags.Contains(n.Name))
                {
                    tbs.addExtractedText("\n");
                }
                ExtractBlocks(n, mbd, null, tbs, false);
                if (Setting.changeLineTags.Contains(n.Name))
                {
                    tbs.addExtractedText("\n");
                }
            }
        }
    }

    if (isRoot)
    {
        tbs.SaveBlock(); // flush the trailing block once the whole tree is done
    }
    return tbs;
}
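// For readers without the rest of the repo: TopicBlocks, MainBodyDetector,
// Setting, and NLPmethods are project classes not shown in this section. The
// sketch below is a hypothetical reconstruction of only the TopicBlocks
// surface that ExtractBlocks and Main actually call; the real class's
// signatures may differ.
public interface ITopicBlocksSketch
{
    // Append text (or a "\n" separator) to the block currently being built.
    void addExtractedText(string text);

    // Flush the current buffer as a finished block under the current header.
    void SaveBlock();

    // Start a new block at header level hx (1-6), keyed by its stemmed tokens.
    void addNewHeader(int hx, string[] subtopicTokens);

    // Score every saved block against the query tokens (used later in Main).
    Pair<string, double>[] getBlocksWithWeight(string[] queryTokens);
}

// Stand-in for the project's Pair type, assumed to be a simple 2-tuple;
// omit this if the repo already defines Pair elsewhere.
public class Pair<TFirst, TSecond>
{
    public TFirst First;
    public TSecond Second;
    public Pair(TFirst first, TSecond second) { First = first; Second = second; }
}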
// Pipeline: for each of the 50 queries, parse the crawled HTML pages into
// weighted topic blocks, split them into sentences, score the sentences
// (LDA, LexRank, Lucene), and write the top-ranked sentences to disk.
static void Main(string[] args)
{
    // Read the query list; the first 10 characters of each line (a
    // fixed-width prefix) are skipped before tokenizing the query text.
    List<string[]> queryTokenList = new List<string[]>();
    List<string> queryList = new List<string>();
    StreamReader sr = new StreamReader(Setting.queryListFile);
    while (!sr.EndOfStream)
    {
        string query = sr.ReadLine().Substring(10);
        string[] queryTokens = NLPmethods.Stemming(
            NLPmethods.FilterOutStopWords(NLPmethods.Tokenization(query)));
        queryTokenList.Add(queryTokens);
        queryList.Add(query);
    }
    sr.Close();

    if (Directory.Exists(Setting.HTML_DirectoryPath))
    {
        for (int qId = 1; qId <= 50; qId++)
        {
            string[] files = Directory.GetFiles(Setting.HTML_DirectoryPath,
                "MC-E-" + String.Format("{0:D4}", qId) + "-*.html");
            int HTMLcountInThisQuestion = files.Length;
            List<Sentence> Q_Sens = new List<Sentence>();
            List<string[]> All_Sens = new List<string[]>();
            int alreadyGetSentencesFromQid = 0, fileCount = 0;

            // Page indices may have gaps, so keep advancing i until enough
            // sentences are collected or every existing page has been read.
            for (int i = 1;
                 alreadyGetSentencesFromQid < Setting.numOfSentencesEachQ
                     && fileCount < HTMLcountInThisQuestion;
                 i++)
            {
                string file = Setting.HTML_DirectoryPath + "\\MC-E-"
                    + String.Format("{0:D4}", qId) + "-" + i + ".html";
                if (!File.Exists(file))
                {
                    continue;
                }
                fileCount++;
                Console.WriteLine(new FileInfo(file).Name);

                sr = new StreamReader(file);
                string html = sr.ReadToEnd();
                sr.Close();

                HtmlDocument doc = new HtmlDocument();
                // Treat <form> as a normal element so its children are parsed.
                HtmlNode.ElementsFlags.Remove("form");
                doc.LoadHtml(html);
                HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//body");
                HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//html//head//title");
                string[] titleTokens = null;
                if (titleNode != null)
                {
                    titleTokens = NLPmethods.Stemming(
                        NLPmethods.FilterOutStopWords(
                            NLPmethods.Tokenization(WebUtility.HtmlDecode(titleNode.InnerText))));
                }

                // Try the site-specific extractor first; fall back to the
                // generic main-body detector plus recursive block extraction.
                Pair<string, double>[] parseResult = QA.ExtractBlocks(doc);
                if (parseResult == null)
                {
                    MainBodyDetector mbd = new MainBodyDetector(bodyNode, Setting.thresholdT);
                    TopicBlocks tbs = ExtractBlocks(bodyNode, mbd, titleTokens);
                    parseResult = (tbs == null
                        ? null
                        : tbs.getBlocksWithWeight(queryTokenList[qId - 1].ToArray()));
                }

                Sentence[] sentences = SplitToSentences(parseResult, queryTokenList[qId - 1].ToArray(), i);
                string[] AllSentences = GetAllSentences(parseResult, queryTokenList[qId - 1].ToArray(), i);
                foreach (Sentence s in sentences)
                {
                    // Only the first OutputSentencesEachQ sentences are
                    // eligible for output; the rest still feed the models.
                    s.isTop = alreadyGetSentencesFromQid < Setting.OutputSentencesEachQ;
                    Q_Sens.Add(s);
                    alreadyGetSentencesFromQid++;
                }
                All_Sens.Add(AllSentences);
            }

            // LDA: train on all collected sentences, then score the candidates.
            LDA lda = new LDA(Setting.topicCount);
            lda.training(All_Sens);
            lda.testing(queryTokenList[qId - 1].ToArray(), Q_Sens.ToArray());

            // LexRank
            LexRank.getScore(Q_Sens.ToArray());

            // Lucene
            Lucene.indexing(Q_Sens.ToArray());
            Lucene.query(Q_Sens.ToArray(), queryList[qId - 1]);

            // Rank sentences by Lucene score, descending.
            Q_Sens.Sort(delegate(Sentence x, Sentence y)
            {
                //double a = x.lexRank * x.logRank * x.tf * x.topicWeight;
                //double b = y.lexRank * y.logRank * y.tf * y.topicWeight;
                double a = x.lucene;
                double b = y.lucene;
                return b.CompareTo(a);
            });

            // Write the top sentences (deduplicated) with all of their scores.
            if (!Directory.Exists(Setting.outputDirectoryPath))
            {
                Directory.CreateDirectory(Setting.outputDirectoryPath);
            }
            StreamWriter sw = new StreamWriter(Setting.outputDirectoryPath + @"\" + qId + ".txt");
            HashSet<string> alreadyOutput = new HashSet<string>();
            foreach (Sentence s in Q_Sens)
            {
                if (!alreadyOutput.Contains(s.sentnece) && s.isTop)
                {
                    sw.WriteLine("sentence:\t\t\t" + s.sentnece);
                    //sw.WriteLine("with chunker:\t\t" + s.senWithChunk);
                    //sw.WriteLine("parser:\n" + NLPmethods.Parser(s.sentnece));
                    sw.WriteLine("term freq:\t\t\t" + s.tf);
                    sw.WriteLine("search rank:\t\t" + s.searchRank);
                    sw.WriteLine("logRank:\t\t\t" + s.logRank);
                    sw.WriteLine("lexRank:\t\t\t" + s.lexRank);
                    sw.WriteLine("subtopic weight:\t" + s.topicWeight);
                    sw.WriteLine("lda:\t\t\t\t" + s.lda);
                    sw.WriteLine("lda2:\t\t\t\t" + s.lda2);
                    sw.WriteLine("lucene:\t\t\t\t" + s.lucene);
                    sw.WriteLine("not stopword count:\t" + s.stemTokens.Length);
                    sw.WriteLine("total token count:\t" + s.tokens.Length);
                    //sw.WriteLine("total:\t\t\t\t" + (s.lexRank * s.logRank * s.tf * s.topicWeight));
                    sw.WriteLine("----------------------------------------------------------------");
                    sw.Flush();
                    alreadyOutput.Add(s.sentnece);
                }
            }
            sw.Close();
        }
    }
    Console.WriteLine("Finish!");
    Console.ReadKey();
}
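// A standalone sketch of Main's final ranking-and-output step, using a
// trimmed-down, hypothetical Sentence stand-in (the project's Sentence class
// carries many more fields). It shows the same descending sort by Lucene
// score followed by first-occurrence deduplication via a HashSet.
static class RankingSketch
{
    class Sentence
    {
        public string Text;
        public double Lucene;
    }

    static void Demo()
    {
        var sens = new List<Sentence>
        {
            new Sentence { Text = "alpha", Lucene = 0.3 },
            new Sentence { Text = "beta",  Lucene = 0.9 },
            new Sentence { Text = "alpha", Lucene = 0.7 }, // duplicate text
        };

        // Descending sort by Lucene score, matching Q_Sens.Sort above.
        sens.Sort((x, y) => y.Lucene.CompareTo(x.Lucene));

        // Emit each distinct sentence once, mirroring the alreadyOutput set;
        // HashSet.Add returns false for an already-seen string.
        var alreadyOutput = new HashSet<string>();
        foreach (var s in sens)
        {
            if (alreadyOutput.Add(s.Text))
            {
                Console.WriteLine(s.Text + "\t" + s.Lucene);
            }
        }
        // Prints: beta 0.9, then alpha 0.7 (the 0.3 duplicate is skipped).
    }
}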