Пример #1
0
        /// <summary>
        /// Recursively walks <paramref name="path"/> and processes every file whose name matches
        /// <paramref name="filePattern"/>. Zip archives are handed to <c>ProcessZip</c>; every other
        /// file is opened for reading and handed to <c>ProcessFile</c>.
        /// </summary>
        /// <param name="path">Directory to scan. If it does not exist, the method does nothing.</param>
        /// <param name="filePattern">Pattern tested against the file name only (not the directory part).</param>
        /// <param name="catagoryHint">Category passed through unchanged to the file/zip processors.</param>
        /// <param name="process">Callback passed through unchanged to the file/zip processors.</param>
        public static void ProcessDir(string path, Regex filePattern, TextDataCatagory catagoryHint, CorpusProcessDelegate process)
        {
            if (!Directory.Exists(path))
            {
                return;
            }

            foreach (string file in Directory.GetFiles(path, "*", SearchOption.AllDirectories))
            {
                // The marker is looked for in the whole path, so a directory whose name
                // contains it excludes everything beneath that directory as well.
                if (file.Contains("[NOPARSE]"))
                {
                    continue;
                }

                if (!filePattern.IsMatch(Path.GetFileName(file)))
                {
                    continue;
                }

                // Ordinal, case-insensitive comparison: a culture-sensitive ToLower() would
                // fail to recognise ".ZIP" under cultures with special casing rules (e.g. tr-TR).
                bool isZip = string.Equals(Path.GetExtension(file), ".zip", System.StringComparison.OrdinalIgnoreCase);
                if (isZip)
                {
                    ProcessZip(file, filePattern, catagoryHint, process);
                }
                else
                {
                    using (Stream s = File.OpenRead(file))
                    {
                        ProcessFile(s, file, catagoryHint, process);
                    }
                }
            }
        }
Пример #2
0
 /// <summary>
 /// Runs <c>ProcessDir</c> over each immediate subdirectory of <paramref name="corpusPath"/>,
 /// passing the value returned by <c>GetCatagoryFromDir</c> for that subdirectory as the category hint.
 /// </summary>
 /// <param name="corpusPath">Root directory whose top-level subdirectories are processed.</param>
 /// <param name="filePattern">File-name pattern forwarded to <c>ProcessDir</c>.</param>
 /// <param name="process">Callback forwarded to <c>ProcessDir</c>.</param>
 public static void ProcessCorpus(string corpusPath, Regex filePattern, CorpusProcessDelegate process)
 {
     foreach (string subDirectory in Directory.GetDirectories(corpusPath, "*", SearchOption.TopDirectoryOnly))
     {
         TextDataCatagory hint = GetCatagoryFromDir(subDirectory);
         ProcessDir(subDirectory, filePattern, hint, process);
     }
 }
Пример #3
0
        /// <summary>
        /// Parses <paramref name="stream"/> with the parser that <c>GetCorpusParserFromPath</c> selects
        /// for <paramref name="path"/> and invokes <paramref name="process"/> once per resulting item.
        /// </summary>
        /// <param name="stream">Content to parse; handed to the parser's <c>LoadData</c>.</param>
        /// <param name="path">Path used to pick the parser and forwarded to <c>LoadData</c>.</param>
        /// <param name="catagoryHint">Category forwarded to <c>LoadData</c>.</param>
        /// <param name="process">Callback invoked for every parsed item, in the order returned.</param>
        public static void ProcessFile(Stream stream, string path, TextDataCatagory catagoryHint, CorpusProcessDelegate process)
        {
            CorpusParser parser = GetCorpusParserFromPath(path);
            IList<TextData> data = parser.LoadData(stream, path, catagoryHint);

            // A parser may hand back null instead of an empty list; treat that as
            // "nothing to process" rather than failing with a NullReferenceException.
            if (data == null)
            {
                return;
            }

            foreach (TextData item in data)
            {
                process(item);
            }
        }
Пример #4
0
        /// <summary>
        /// Builds a <c>TextData</c> whose type is <c>TextDataType.Paragraph</c> from the given
        /// title, text and category.
        /// </summary>
        public static TextData fromParagraph(string title, string text, TextDataCatagory catagory)
        {
            // Members are initialised in the same order as before: Text, Title, Type, Catagory.
            return new TextData
            {
                Text     = text,
                Title    = title,
                Type     = TextDataType.Paragraph,
                Catagory = catagory
            };
        }
Пример #5
0
        /// <summary>
        /// Builds a <c>TextData</c> whose type is <c>TextDataType.EntireWork</c> from the given
        /// title, text and category.
        /// </summary>
        public static TextData fromBook(string title, string text, TextDataCatagory catagory)
        {
            // Members are initialised in the same order as before: Text, Title, Type, Catagory.
            return new TextData
            {
                Text     = text,
                Title    = title,
                Type     = TextDataType.EntireWork,
                Catagory = catagory
            };
        }
Пример #6
0
        /// <summary>
        /// Reads the whole stream as text and returns it as a single <c>TextData</c> created by
        /// <c>TextData.fromBook</c>, titled with the file name of <paramref name="path"/> minus its extension.
        /// </summary>
        /// <param name="stream">Source of the text; read to the end.</param>
        /// <param name="path">Path from which the item's title is derived.</param>
        /// <param name="catagoryHint">Category assigned to the returned item.</param>
        /// <returns>A list holding exactly one item.</returns>
        public override IList<TextData> LoadData(Stream stream, string path, TextDataCatagory catagoryHint)
        {
            IList<TextData> result = new List<TextData>();

            // Disposing the StreamReader also closes the underlying stream.
            using (StreamReader input = new StreamReader(stream))
            {
                string contents = input.ReadToEnd();
                string title    = Path.GetFileNameWithoutExtension(path);

                result.Add(TextData.fromBook(title, contents, catagoryHint));
            }

            return result;
        }
Пример #7
0
 /// <summary>
 /// Opens the zip archive at <paramref name="path"/> and passes every entry whose name matches
 /// <paramref name="filePattern"/> to <c>ProcessFile</c>.
 /// </summary>
 /// <param name="path">Location of the zip archive.</param>
 /// <param name="filePattern">Pattern tested against each entry's <c>Name</c>.</param>
 /// <param name="catagoryHint">Category forwarded to <c>ProcessFile</c>.</param>
 /// <param name="process">Callback forwarded to <c>ProcessFile</c>.</param>
 public static void ProcessZip(string path, Regex filePattern, TextDataCatagory catagoryHint, CorpusProcessDelegate process)
 {
     using (ZipArchive zip = ZipFile.OpenRead(path))
     {
         foreach (ZipArchiveEntry item in zip.Entries)
         {
             if (!filePattern.IsMatch(item.Name))
             {
                 continue;
             }

             // Entries are identified as "<archive path>@<full entry name>".
             string entryPath = path + "@" + item.FullName;

             using (Stream content = item.Open())
             {
                 ProcessFile(content, entryPath, catagoryHint, process);
             }
         }
     }
 }
Пример #8
0
 /// <summary>
 /// Placeholder implementation: ignores all of its arguments and yields no items.
 /// </summary>
 /// <returns>
 /// An empty list (never null), so callers can enumerate the result without a null check.
 /// </returns>
 public override IList<TextData> LoadData(System.IO.Stream stream, string path, TextDataCatagory catagoryHint)
 {
     return new List<TextData>();
 }
Пример #9
0
 /// <summary>
 /// Produces the <c>TextData</c> items contained in <paramref name="stream"/>.
 /// Implemented by each concrete parser.
 /// </summary>
 /// <param name="stream">Content to parse.</param>
 /// <param name="path">Path associated with the content (presumably where it came from; confirm with callers).</param>
 /// <param name="catagoryHint">Category suggested by the caller for the produced items.</param>
 /// <returns>The parsed items. NOTE(review): whether null is an allowed return value is not specified here.</returns>
 public abstract IList <TextData> LoadData(Stream stream, string path, TextDataCatagory catagoryHint);
Пример #10
0
        /// <summary>
        /// Loads <paramref name="stream"/> as an XML document (DTD processing and validation enabled)
        /// and walks it, collecting text into <c>TextData</c> paragraph items.
        /// </summary>
        /// <param name="stream">XML content to load.</param>
        /// <param name="path">Not used by this implementation.</param>
        /// <param name="catagoryHint">Category assigned to every produced item.</param>
        /// <returns>
        /// The collected items. The walk visits children last-to-first, so when
        /// <c>SplitParagraphs</c> is set the items are emitted in that traversal order.
        /// </returns>
        public override IList<TextData> LoadData(System.IO.Stream stream, string path, TextDataCatagory catagoryHint)
        {
            // Set up a competent XML reader.
            // NOTE(review): DTD parsing plus a 64 MiB entity-expansion budget is risky on
            // untrusted input - confirm that corpus files are trusted.
            XmlReaderSettings settings = new XmlReaderSettings();

            settings.DtdProcessing             = DtdProcessing.Parse;
            settings.ValidationType            = ValidationType.DTD;
            settings.MaxCharactersFromEntities = 64 * 1024 * 1024;

            XmlResolver resolver = GetXMLResolver();

            if (resolver != null)
            {
                settings.XmlResolver = resolver;
            }

            XmlDocument xml = new XmlDocument();

            // Dispose the reader once the document is loaded. CloseInput is left at its
            // default (false), so the caller's stream is not closed by this.
            using (XmlReader reader = XmlReader.Create(stream, settings))
            {
                xml.Load(reader);
            }

            List<TextData> items = new List<TextData>();

            // Do the XML parsing.
            StringBuilder sb = new StringBuilder();

            // Explicit stack instead of recursion. Children are pushed in document order and
            // therefore popped last-to-first, which is why text is prepended to the buffer.
            Stack<XmlNode> nodes = new Stack<XmlNode>();

            nodes.Push(xml);

            while (nodes.Count > 0)
            {
                XmlNode currentNode = nodes.Pop();
                if (useNode(currentNode))
                {
                    if (currentNode.Name == "#text")
                    {
                        sb.Insert(0, currentNode.InnerText);
                    }
                    else
                    {
                        if (SplitParagraphs && isParagraphNode(currentNode))
                        {
                            // Flush whatever has been buffered so far as one item.
                            // NOTE(review): unlike the final flush below, this happens even
                            // when the buffer is empty - confirm that is intended.
                            items.Add(TextData.fromParagraph("", sb.ToString(), catagoryHint));
                            sb.Clear();
                        }
                        else
                        {
                            string s = nodeToString(currentNode);
                            if (s != null)
                            {
                                sb.Insert(0, s);
                            }
                        }
                    }

                    // Descend only into nodes accepted by useNode.
                    foreach (XmlNode subNode in currentNode)
                    {
                        nodes.Push(subNode);
                    }
                }
            }

            // Emit the remaining text, skipping leftovers of three characters or fewer.
            if (sb.Length > 3)
            {
                items.Add(TextData.fromParagraph("", sb.ToString(), catagoryHint));
            }

            return items;
        }