/// <summary>
/// Recursively processes every file under <paramref name="path"/> whose file name
/// matches <paramref name="filePattern"/>.
/// </summary>
/// <remarks>
/// Does nothing when <paramref name="path"/> is not an existing directory. Any file whose
/// full path contains the literal marker <c>[NOPARSE]</c> is skipped. Matching <c>.zip</c>
/// files are handed to <c>ProcessZip</c>; every other matching file is opened for reading
/// and handed to <c>ProcessFile</c>.
/// </remarks>
/// <param name="path">Root directory to scan (all sub-directories are included).</param>
/// <param name="filePattern">Pattern tested against the file name only, not the full path.</param>
/// <param name="catagoryHint">Category forwarded unchanged to the file/zip processors.</param>
/// <param name="process">Callback forwarded unchanged to the file/zip processors.</param>
public static void ProcessDir(string path, Regex filePattern, TextDataCatagory catagoryHint, CorpusProcessDelegate process)
{
    if (!Directory.Exists(path))
    {
        return;
    }

    foreach (string file in Directory.GetFiles(path, "*", SearchOption.AllDirectories))
    {
        if (file.Contains("[NOPARSE]"))
        {
            continue;
        }

        if (!filePattern.IsMatch(Path.GetFileName(file)))
        {
            continue;
        }

        // Ordinal, case-insensitive comparison: a culture-sensitive ToLower() would turn
        // ".ZIP" into ".zıp" under Turkish-style casing rules and miss the archive.
        bool isZip = string.Equals(Path.GetExtension(file), ".zip", System.StringComparison.OrdinalIgnoreCase);
        if (isZip)
        {
            ProcessZip(file, filePattern, catagoryHint, process);
        }
        else
        {
            using (Stream s = File.OpenRead(file))
            {
                ProcessFile(s, file, catagoryHint, process);
            }
        }
    }
}
/// <summary>
/// Processes a corpus laid out as one sub-directory per category.
/// </summary>
/// <remarks>
/// Only the immediate sub-directories of <paramref name="corpusPath"/> are enumerated here;
/// each one is mapped to a category by <c>GetCatagoryFromDir</c> and then handed to
/// <c>ProcessDir</c>.
/// </remarks>
/// <param name="corpusPath">Directory whose direct children are the per-category folders.</param>
/// <param name="filePattern">File-name pattern forwarded to <c>ProcessDir</c>.</param>
/// <param name="process">Callback forwarded to <c>ProcessDir</c>.</param>
public static void ProcessCorpus(string corpusPath, Regex filePattern, CorpusProcessDelegate process)
{
    foreach (string categoryDir in Directory.GetDirectories(corpusPath, "*", SearchOption.TopDirectoryOnly))
    {
        ProcessDir(categoryDir, filePattern, GetCatagoryFromDir(categoryDir), process);
    }
}
/// <summary>
/// Parses one stream with the parser selected for <paramref name="path"/> and passes each
/// resulting item to <paramref name="process"/>.
/// </summary>
/// <param name="stream">Open, readable stream with the file contents.</param>
/// <param name="path">Path used to pick the parser (via <c>GetCorpusParserFromPath</c>) and forwarded to it.</param>
/// <param name="catagoryHint">Category forwarded to the parser.</param>
/// <param name="process">Invoked once per item, in the order the parser returned them.</param>
public static void ProcessFile(Stream stream, string path, TextDataCatagory catagoryHint, CorpusProcessDelegate process)
{
    CorpusParser parser = GetCorpusParserFromPath(path);
    IList<TextData> data = parser.LoadData(stream, path, catagoryHint);

    // LoadData implementations may return null instead of an empty list;
    // treat that as "nothing to process" rather than crashing in the loop.
    if (data == null)
    {
        return;
    }

    foreach (TextData item in data)
    {
        process(item);
    }
}
/// <summary>
/// Creates a <c>TextData</c> tagged as <c>TextDataType.Paragraph</c>.
/// </summary>
/// <param name="title">Value stored in <c>Title</c>.</param>
/// <param name="text">Value stored in <c>Text</c>.</param>
/// <param name="catagory">Value stored in <c>Catagory</c>.</param>
/// <returns>The newly created item.</returns>
public static TextData fromParagraph(string title, string text, TextDataCatagory catagory)
{
    return new TextData
    {
        Text = text,
        Title = title,
        Type = TextDataType.Paragraph,
        Catagory = catagory,
    };
}
/// <summary>
/// Creates a <c>TextData</c> tagged as <c>TextDataType.EntireWork</c>.
/// </summary>
/// <param name="title">Value stored in <c>Title</c>.</param>
/// <param name="text">Value stored in <c>Text</c>.</param>
/// <param name="catagory">Value stored in <c>Catagory</c>.</param>
/// <returns>The newly created item.</returns>
public static TextData fromBook(string title, string text, TextDataCatagory catagory)
{
    return new TextData
    {
        Text = text,
        Title = title,
        Type = TextDataType.EntireWork,
        Catagory = catagory,
    };
}
/// <summary>
/// Reads the whole stream as text and returns it as a single entire-work item.
/// </summary>
/// <param name="stream">Stream to read to the end; it is closed when the reader is disposed.</param>
/// <param name="path">Source path; its file name without extension becomes the item title.</param>
/// <param name="catagoryHint">Category assigned to the item.</param>
/// <returns>A list holding exactly one item built with <c>TextData.fromBook</c>.</returns>
public override IList<TextData> LoadData(Stream stream, string path, TextDataCatagory catagoryHint)
{
    IList<TextData> result = new List<TextData>();
    using (StreamReader input = new StreamReader(stream))
    {
        string contents = input.ReadToEnd();
        string title = Path.GetFileNameWithoutExtension(path);
        result.Add(TextData.fromBook(title, contents, catagoryHint));
    }

    return result;
}
/// <summary>
/// Processes every file entry of a zip archive whose entry name matches
/// <paramref name="filePattern"/>.
/// </summary>
/// <remarks>
/// Each matching entry is opened and handed to <c>ProcessFile</c> with the synthetic path
/// <c>{archive path}@{entry full name}</c>.
/// </remarks>
/// <param name="path">Path of the zip archive to open for reading.</param>
/// <param name="filePattern">Pattern tested against the entry's name (without its directory part).</param>
/// <param name="catagoryHint">Category forwarded unchanged to <c>ProcessFile</c>.</param>
/// <param name="process">Callback forwarded unchanged to <c>ProcessFile</c>.</param>
public static void ProcessZip(string path, Regex filePattern, TextDataCatagory catagoryHint, CorpusProcessDelegate process)
{
    using (ZipArchive archive = ZipFile.OpenRead(path))
    {
        foreach (ZipArchiveEntry entry in archive.Entries)
        {
            // Directory entries have an empty Name and no content. Skip them so a pattern
            // that matches the empty string does not feed empty streams to the parsers.
            if (entry.Name.Length == 0)
            {
                continue;
            }

            if (!filePattern.IsMatch(entry.Name))
            {
                continue;
            }

            using (Stream s = entry.Open())
            {
                ProcessFile(s, path + "@" + entry.FullName, catagoryHint, process);
            }
        }
    }
}
/// <summary>
/// No-op loader: ignores its input and produces no items.
/// </summary>
/// <param name="stream">Not read by this implementation.</param>
/// <param name="path">Not used by this implementation.</param>
/// <param name="catagoryHint">Not used by this implementation.</param>
/// <returns>
/// An empty list. An empty list is returned rather than null so that callers can iterate
/// the result without a null check.
/// </returns>
public override IList<TextData> LoadData(System.IO.Stream stream, string path, TextDataCatagory catagoryHint)
{
    return new List<TextData>();
}
/// <summary>
/// Loads text items from <paramref name="stream"/>. Each concrete parser defines how the
/// stream is interpreted and how many items it yields.
/// </summary>
/// <param name="stream">Stream to read the source data from.</param>
/// <param name="path">Path associated with the stream. NOTE(review): presumably the originating file or archive-entry path — confirm against callers.</param>
/// <param name="catagoryHint">Category the caller suggests for the loaded items.</param>
/// <returns>The loaded items.</returns>
public abstract IList <TextData> LoadData(Stream stream, string path, TextDataCatagory catagoryHint);
/// <summary>
/// Loads an XML document from <paramref name="stream"/> and flattens it into paragraph items.
/// </summary>
/// <remarks>
/// The document is read with DTD processing and DTD validation enabled, using the resolver
/// from <c>GetXMLResolver</c> when one is supplied. The node tree is then walked with an
/// explicit stack: a node is visited before its children, and children are visited last to
/// first. Every piece of text is inserted at the front of the buffer, so the buffer ends up
/// in document order. Nodes rejected by <c>useNode</c> are skipped together with their
/// whole subtree.
/// When <c>SplitParagraphs</c> is set, reaching a paragraph node flushes the buffer into a
/// new item; because of the traversal direction, items are appended to the result starting
/// from the end of the document.
/// </remarks>
/// <param name="stream">Stream containing the XML document. It is not closed by this method.</param>
/// <param name="path">Not used by this implementation.</param>
/// <param name="catagoryHint">Category assigned to every produced item.</param>
/// <returns>The paragraph items extracted from the document (titles are empty strings).</returns>
public override IList<TextData> LoadData(System.IO.Stream stream, string path, TextDataCatagory catagoryHint)
{
    // Set up a competent XML reader.
    // NOTE(review): DTD parsing plus an external resolver exposes the loader to
    // entity-expansion / external-entity attacks if the corpus is untrusted — confirm
    // that inputs are trusted.
    XmlReaderSettings settings = new XmlReaderSettings();
    settings.DtdProcessing = DtdProcessing.Parse;
    settings.ValidationType = ValidationType.DTD;
    settings.MaxCharactersFromEntities = 64 * 1024 * 1024;
    XmlResolver resolver = GetXMLResolver();
    if (resolver != null)
    {
        settings.XmlResolver = resolver;
    }

    // The document is fully materialised by Load, so the reader can be released right
    // away. CloseInput is left at its default, so disposing the reader does not close
    // the caller's stream.
    XmlDocument xml = new XmlDocument();
    using (XmlReader reader = XmlReader.Create(stream, settings))
    {
        xml.Load(reader);
    }

    List<TextData> items = new List<TextData>();

    // Do the XML parsing.
    StringBuilder sb = new StringBuilder();
    Stack<XmlNode> nodes = new Stack<XmlNode>();
    nodes.Push(xml);
    while (nodes.Count > 0)
    {
        XmlNode currentNode = nodes.Pop();
        if (!useNode(currentNode))
        {
            // Rejected nodes are dropped along with all of their descendants.
            continue;
        }

        if (currentNode.Name == "#text")
        {
            sb.Insert(0, currentNode.InnerText);
        }
        else if (SplitParagraphs && isParagraphNode(currentNode))
        {
            // The buffer currently holds the text gathered since the previously visited
            // paragraph node; emit it and start collecting for this one.
            // NOTE(review): unlike the final flush below, no minimum-length check is
            // applied here, so empty items can be emitted — confirm this is intended.
            items.Add(TextData.fromParagraph("", sb.ToString(), catagoryHint));
            sb.Clear();
        }
        else
        {
            string s = nodeToString(currentNode);
            if (s != null)
            {
                sb.Insert(0, s);
            }
        }

        foreach (XmlNode subNode in currentNode)
        {
            nodes.Push(subNode);
        }
    }

    // Flush whatever is left, ignoring buffers of three characters or fewer.
    if (sb.Length > 3)
    {
        items.Add(TextData.fromParagraph("", sb.ToString(), catagoryHint));
    }

    return items;
}