Beispiel #1
0
        /// <summary>
        /// Reads the file named by the first command-line argument, runs it through
        /// <c>HTMLWikiProcessor</c>, and writes three comma-separated token lists:
        /// <c>words.txt</c> (tokens after <c>ProcessHTML</c>), <c>title.txt</c>
        /// (title tokens), and <c>specificdiv.txt</c> (tokens after
        /// <c>ProcessDivHTML</c> with the div filters below).
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the input file path.</param>
        static void Main(string[] args)
        {
            // Fail with a usage hint instead of an IndexOutOfRangeException.
            if (args == null || args.Length == 0)
            {
                System.Console.Error.WriteLine("usage: <program> <input-file>");
                System.Environment.ExitCode = 1;
                return;
            }

            // Dispose the reader deterministically; the original never closed it.
            string body;
            using (StreamReader stream = new StreamReader(args[0]))
            {
                body = stream.ReadToEnd();
            }

            MemoryManager mem = new MemoryManager(4000000, 4000000);
            DecodedTextClass content = new DecodedTextClass(mem, true);
            HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet<int>(), false);
            textproc.LoadDecodedTextClass(ref content);
            content.resetDecoder();
            textproc.ProcessHTML(body);

            // 'using' guarantees each writer is flushed and closed even if Write throws.
            using (StreamWriter sw = new StreamWriter("words.txt"))
            {
                sw.Write(string.Join(",", content.GetTokens()));
            }

            // if text is an html page, we can extract the title
            using (StreamWriter sw = new StreamWriter("title.txt"))
            {
                sw.Write(string.Join(",", content.GetTitleTokens()));
            }

            // if text is an html page, we can extract text only from within div's with a matching id
            content.resetDecoder(); // need to reset to reuse the DecodedTextClass object
            HashSet<string> divfilters = new HashSet<string>();
            divfilters.Add("id=\"articleBody\"");
            divfilters.Add("class=\"articleBody\"");

            textproc.ProcessDivHTML(body, divfilters);

            // Fetch the tokens again after the div-filtered pass. The original reused
            // the array that still held the title tokens, so specificdiv.txt was a
            // copy of title.txt.
            // NOTE(review): assumes GetTokens() exposes the ProcessDivHTML result the
            // same way it does for ProcessHTML above - confirm against DecodedTextClass.
            using (StreamWriter sw = new StreamWriter("specificdiv.txt"))
            {
                sw.Write(string.Join(",", content.GetTokens()));
            }
        }
Beispiel #2
0
        /// <summary>
        /// Tokenizes the file given as <c>args[0]</c> with <c>HTMLWikiProcessor</c> and
        /// writes the results as comma-separated lists to <c>words.txt</c> (tokens after
        /// <c>ProcessHTML</c>), <c>title.txt</c> (title tokens) and
        /// <c>specificdiv.txt</c> (tokens after <c>ProcessDivHTML</c>).
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the input file path.</param>
        static void Main(string[] args)
        {
            // Report missing input explicitly rather than crashing on args[0].
            if (args == null || args.Length == 0)
            {
                System.Console.Error.WriteLine("usage: <program> <input-file>");
                System.Environment.ExitCode = 1;
                return;
            }

            string body;

            // The reader was never closed in the original; scope it so it is disposed.
            using (StreamReader stream = new StreamReader(args[0]))
            {
                body = stream.ReadToEnd();
            }

            MemoryManager     mem      = new MemoryManager(4000000, 4000000);
            DecodedTextClass  content  = new DecodedTextClass(mem, true);
            HTMLWikiProcessor textproc = new HTMLWikiProcessor(new HashSet <int>(), false);

            textproc.LoadDecodedTextClass(ref content);
            content.resetDecoder();
            textproc.ProcessHTML(body);

            string[] tokens = content.GetTokens();

            // Each writer is wrapped in 'using' so the file is closed even on failure.
            using (StreamWriter sw = new StreamWriter("words.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }

            // if text is an html page, we can extract the title
            tokens = content.GetTitleTokens();
            using (StreamWriter sw = new StreamWriter("title.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }

            // if text is an html page, we can extract text only from within div's with a matching id
            content.resetDecoder(); // need to reset to reuse the DecodedTextClass object
            HashSet <string> divfilters = new HashSet <string>();

            divfilters.Add("id=\"articleBody\"");
            divfilters.Add("class=\"articleBody\"");

            textproc.ProcessDivHTML(body, divfilters);

            // Refresh 'tokens' from the div-filtered pass. Previously the title tokens
            // were written here because the array was never re-read.
            // NOTE(review): assumes GetTokens() returns the ProcessDivHTML output, as it
            // does for ProcessHTML above - confirm against DecodedTextClass.
            tokens = content.GetTokens();
            using (StreamWriter sw = new StreamWriter("specificdiv.txt"))
            {
                sw.Write(string.Join(",", tokens));
            }
        }