/// <summary>
/// Reads the file named by <c>args[0]</c>, runs it through <c>HTMLWikiProcessor</c>, and writes
/// three comma-separated token lists to the current directory: <c>words.txt</c> (tokens after
/// <c>ProcessHTML</c>), <c>title.txt</c> (title tokens), and <c>specificdiv.txt</c> (tokens after
/// <c>ProcessDivHTML</c> with the <c>articleBody</c> filters).
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the input file.</param>
static void Main(string[] args)
{
    // Without an input path there is nothing to process; report it instead of
    // failing with an index-out-of-range exception.
    if (args == null || args.Length == 0)
    {
        System.Console.Error.WriteLine("usage: <program> <input-file>");
        System.Environment.ExitCode = 1;
        return;
    }

    string body;
    using (StreamReader stream = new StreamReader(args[0]))
    {
        body = stream.ReadToEnd();
    }

    MemoryManager mem = new MemoryManager(4000000, 4000000);
    DecodedTextClass content = new DecodedTextClass(mem, true);
    HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet<int>(), false);
    textproc.LoadDecodedTextClass(ref content);

    content.resetDecoder();
    textproc.ProcessHTML(body);

    string[] tokens = content.GetTokens();
    using (StreamWriter sw = new StreamWriter("words.txt"))
    {
        sw.Write(string.Join(",", tokens));
    }

    // if text is an html page, we can extract the title
    tokens = content.GetTitleTokens();
    using (StreamWriter sw = new StreamWriter("title.txt"))
    {
        sw.Write(string.Join(",", tokens));
    }

    // if text is an html page, we can extract text only from within div's
    // matching one of the filters below
    content.resetDecoder(); // need to reset to reuse the DecodedTextClass object
    HashSet<string> divfilters = new HashSet<string>();
    divfilters.Add("id=\"articleBody\"");
    divfilters.Add("class=\"articleBody\"");
    textproc.ProcessDivHTML(body, divfilters);

    // Re-read the tokens after the div-filtered pass; otherwise the title tokens
    // fetched above would be written to specificdiv.txt.
    tokens = content.GetTokens();
    using (StreamWriter sw = new StreamWriter("specificdiv.txt"))
    {
        sw.Write(string.Join(",", tokens));
    }
}
/// <summary>
/// Reads the file named by <c>args[0]</c>, runs it through <c>HTMLWikiProcessor</c>, and writes
/// three comma-separated token lists to the current directory: <c>words.txt</c> (tokens after
/// <c>ProcessHTML</c>), <c>title.txt</c> (title tokens), and <c>specificdiv.txt</c> (tokens after
/// <c>ProcessDivHTML</c> with the <c>articleBody</c> filters).
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the input file.</param>
static void Main(string[] args)
{
    // Without an input path there is nothing to process; report it instead of
    // failing with an index-out-of-range exception.
    if (args == null || args.Length == 0)
    {
        System.Console.Error.WriteLine("usage: <program> <input-file>");
        System.Environment.ExitCode = 1;
        return;
    }

    string body;
    using (StreamReader stream = new StreamReader(args[0]))
    {
        body = stream.ReadToEnd();
    }

    MemoryManager mem = new MemoryManager(4000000, 4000000);
    DecodedTextClass content = new DecodedTextClass(mem, true);
    HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet<int>(), false);
    textproc.LoadDecodedTextClass(ref content);

    content.resetDecoder();
    textproc.ProcessHTML(body);

    string[] tokens = content.GetTokens();
    using (StreamWriter sw = new StreamWriter("words.txt"))
    {
        sw.Write(string.Join(",", tokens));
    }

    // if text is an html page, we can extract the title
    tokens = content.GetTitleTokens();
    using (StreamWriter sw = new StreamWriter("title.txt"))
    {
        sw.Write(string.Join(",", tokens));
    }

    // if text is an html page, we can extract text only from within div's
    // matching one of the filters below
    content.resetDecoder(); // need to reset to reuse the DecodedTextClass object
    HashSet<string> divfilters = new HashSet<string>();
    divfilters.Add("id=\"articleBody\"");
    divfilters.Add("class=\"articleBody\"");
    textproc.ProcessDivHTML(body, divfilters);

    // Re-read the tokens after the div-filtered pass; otherwise the title tokens
    // fetched above would be written to specificdiv.txt.
    tokens = content.GetTokens();
    using (StreamWriter sw = new StreamWriter("specificdiv.txt"))
    {
        sw.Write(string.Join(",", tokens));
    }
}