// Converts an HTML document into plain text: every run of text between tags becomes a
// "TextBlock" annotation, and consecutive runs are merged into one block unless a tag
// from mTagKeepList occurs between them. On success document.Text is replaced by the
// extracted plain text and "contentType" is switched from "Html" to "Text".
public /*protected*/ override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Html") { return; } // only HTML documents are processed
    try
    {
        HtmlTokenizer htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/ null, /*decode=*/ true, /*tokenize=*/ false, /*applySkipRules=*/ true);
        // idx tracks the character offset of the next block in the plain-text output
        // that is assembled at the end with sb.AppendLine.
        int idx = 0;
        ArrayList <string> txtBlocks = new ArrayList <string>();
        bool merge = false; // true while the next text token should extend the last block
        for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
        {
            if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
            {
                string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/ true);
                if (textBlock != "")
                {
                    if (!merge)
                    {
                        // start a new block; annotation spans are inclusive [start, end]
                        txtBlocks.Add(textBlock);
                        document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                    }
                    else
                    {
                        // merge into the previous block: join with a single space and
                        // replace the last annotation with one covering the union.
                        idx--; // the 2-char separator is replaced by 1 space
                        txtBlocks.Last += " " + textBlock;
                        int oldStartIdx = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                        document.RemoveAnnotationAt(document.AnnotationCount - 1);
                        document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                    }
                    // NOTE(review): "+ 2" assumes AppendLine writes a 2-char newline
                    // (CRLF); on a platform where Environment.NewLine is "\n" the
                    // annotation offsets would drift — TODO confirm target platform.
                    idx += textBlock.Length + 2;
                    merge = true;
                }
            }
            else
            {
                // a tag from the keep-list acts as a block separator
                if (mTagKeepList.Contains(e.CurrentToken.TagName.ToLower())) { merge = false; }
            }
        }
        // materialize the plain text in the same order the offsets were computed
        StringBuilder sb = new StringBuilder();
        foreach (string textBlock in txtBlocks) { sb.AppendLine(textBlock); }
        document.Text = sb.ToString();
        document.Features.SetFeatureValue("contentType", "Text");
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
// Strips boilerplate from an HTML document. Each extracted block is annotated as
// "TextBlock/<textClass>" over its span in the resulting plain text; afterwards
// document.Text holds the plain text and "contentType" becomes "Text".
public /*protected*/ override void ProcessDocument(Document document)
{
    if (document.Features.GetFeatureValue("contentType") != "Html") { return; }
    try
    {
        List <BoilerplateRemover.HtmlBlock> htmlBlocks;
        mBoilerplateRemover.ExtractText(new StringReader(document.Text), BoilerplateRemover.TextClass.Unknown, out htmlBlocks);
        StringBuilder plainText = new StringBuilder();
        foreach (BoilerplateRemover.HtmlBlock htmlBlock in htmlBlocks)
        {
            string content = htmlBlock.text;
            if (string.IsNullOrEmpty(content)) { continue; } // skip empty blocks
            int start = plainText.Length;                    // inclusive span start
            int end = start + (content.Length - 1);          // inclusive span end
            document.AddAnnotation(new Annotation(start, end, "TextBlock/" + htmlBlock.textClass.ToString()));
            plainText.AppendLine(content);
        }
        document.Text = plainText.ToString();
        document.Features.SetFeatureValue("contentType", "Text");
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
// Detects sentence boundaries inside every annotated text block and adds one
// inclusive-span "Sentence" annotation per detected sentence, trimming leading
// and trailing whitespace from each span via GetTrimOffsets.
/*protected*/ public override void ProcessDocument(Document document)
{
    if (document.Features.GetFeatureValue("contentType") != "Text") { return; }
    try
    {
        foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
        {
            OpenNLP.Tools.Util.Pair<int, int>[] positions;
            string[] sentences = mSentenceDetector.SentenceDetect(block.Text, out positions);
            for (int k = 0; k < positions.Length; k++)
            {
                OpenNLP.Tools.Util.Pair<int, int> pos = positions[k];
                int startTrimOffset, endTrimOffset;
                GetTrimOffsets(sentences[k], out startTrimOffset, out endTrimOffset);
                // pos.FirstValue = sentence offset within the block; pos.SecondValue = length
                int sentenceStart = block.SpanStart + pos.FirstValue + startTrimOffset;
                int sentenceEnd = block.SpanStart + pos.FirstValue + (pos.SecondValue - 1) - endTrimOffset;
                if (sentenceEnd >= sentenceStart) // skip sentences that trim to nothing
                {
                    document.AddAnnotation(new Annotation(sentenceStart, sentenceEnd, "Sentence"));
                }
            }
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
// Renders a tokenized, chunked corpus sentence file as an HTML page.
// Reads a TEI-like XML document (//header/naslov = title; //text/body//p/s = sentences;
// w = word, c = punctuation, S = whitespace token), rebuilds the plain text with
// per-token annotations ("beseda" = word, "ločilo" = punctuation, with "oznaka" = MSD
// tag and "lema" = lemma features), appends a chunking report grouped by chunk type,
// copies the feature vector into document features, and writes the HTML to fileName.
public static void SaveHtml(string[] featureNames, SparseVector <double> vec, XmlDocument xmlDoc, ArrayList <Chunk> chunks, string fileName)
{
    Document doc = new Document(xmlDoc.SelectSingleNode("//header/naslov").InnerText, "");
    StringBuilder txt = new StringBuilder();
    XmlNodeList nodes = xmlDoc.SelectNodes("//text/body//p/s");
    foreach (XmlNode node in nodes) // for each sentence...
    {
        foreach (XmlNode wordNode in node.SelectNodes("w | c | S"))
        {
            if (wordNode.Name == "S")
            {
                // an <S/> element marks whitespace between tokens
                txt.Append(" ");
            }
            else
            {
                string str = wordNode.InnerText;
                int spanStart = txt.Length;
                int spanEnd = spanStart + str.Length - 1; // spans are inclusive
                txt.Append(str);
                Annotation a = new Annotation(spanStart, spanEnd, wordNode.Name == "w" ? "beseda" : "ločilo");
                if (wordNode.Name == "w")
                {
                    // morphosyntactic tag and lemma come from the XML attributes
                    a.Features.SetFeatureValue("oznaka", wordNode.Attributes["msd"].Value);
                    a.Features.SetFeatureValue("lema", wordNode.Attributes["lemma"].Value);
                }
                doc.AddAnnotation(a);
            }
        }
        txt.AppendLine(); // one line per sentence
    }
    // append the chunking result section, one group per chunk type
    txt.AppendLine();
    txt.AppendLine("Rezultat členitve:");
    txt.AppendLine();
    foreach (ChunkType chunkType in new ChunkType[] { ChunkType.VP, ChunkType.NP, ChunkType.PP, ChunkType.AP, ChunkType.CON, ChunkType.Other })
    {
        string chunkTypeStr = chunkType.ToString();
        if (chunkTypeStr == "Other") { chunkTypeStr = "Ostalo"; } // localized label
        txt.AppendLine(chunkTypeStr + ":");
        foreach (Chunk chunk in chunks.Where(x => x.mType == chunkType))
        {
            txt.AppendLine("\t" + chunk.ToString());
        }
    }
    doc.Text = txt.ToString();
    // store the feature vector values as document features, aligned by index
    // (assumes vec has at least featureNames.Length entries — TODO confirm)
    int i = 0;
    foreach (string featureName in featureNames)
    {
        doc.Features.SetFeatureValue(featureName, vec[i++].ToString());
    }
    using (StreamWriter w = new StreamWriter(fileName, /*append=*/ false, Encoding.UTF8))
    {
        doc.MakeHtmlPage(w, /*inlineCss=*/ true);
    }
}
/// <summary>
/// Returns a full C# syntax tree resolver which is shared between semantic highlighting, source analysis and refactoring.
/// For code analysis tasks this should be used instead of generating an own resolver. Only exception is if a local resolving is done using a
/// resolve navigator.
/// Note: The shared resolver is fully resolved.
/// </summary>
public static TaskWrapper GetSharedResolver(this Document document)
{
    var parsedDocument = document.ParsedDocument;
    // no parse result yet, or project context is being rebuilt -> nothing to resolve against
    if (parsedDocument == null || document.IsProjectContextInUpdate) {
        return(null);
    }
    var unit = parsedDocument.GetAst <SyntaxTree> ();
    var parsedFile = parsedDocument.ParsedFile as CSharpUnresolvedFile;
    if (unit == null || parsedFile == null) {
        return(null);
    }
    var compilation = document.Compilation;
    // The resolver is cached on the document as an annotation; reuse it while the
    // unresolved file is unchanged, otherwise cancel the stale task and rebuild.
    var resolverAnnotation = document.Annotation <ResolverAnnotation> ();
    if (resolverAnnotation != null) {
        if (resolverAnnotation.ParsedFile == parsedFile) {
            return(resolverAnnotation.Task); // cache hit
        }
        if (resolverAnnotation.SharedTokenSource != null) {
            resolverAnnotation.SharedTokenSource.Cancel(); // abandon the outdated resolve run
        }
        document.RemoveAnnotations <ResolverAnnotation> ();
    }
    var tokenSource = new CancellationTokenSource();
    var token = tokenSource.Token;
    // Resolve on a background task; cancellation and failures yield a null result
    // rather than propagating out of the shared task.
    var resolveTask = Task.Factory.StartNew(delegate {
        try {
            using (var timer = ResolveCounter.BeginTiming()) {
                var result = new CSharpAstResolver(compilation, unit, parsedFile);
                result.ApplyNavigator(new ConstantModeResolveVisitorNavigator(ResolveVisitorNavigationMode.Resolve, null), token);
                return(result);
            }
        } catch (OperationCanceledException) {
            return(null);
        } catch (Exception e) {
            LoggingService.LogError("Error while creating the resolver.", e);
            return(null);
        }
    }, token);
    var wrapper = new TaskWrapper(resolveTask);
    // cache the wrapper (plus the token source so a later call can cancel it)
    document.AddAnnotation(new ResolverAnnotation {
        Task = wrapper,
        ParsedFile = parsedFile,
        SharedTokenSource = tokenSource
    });
    return(wrapper);
}
// Ad-hoc test driver: builds a tiny two-document corpus, attaches one annotation,
// serializes the corpus to XML, and dumps the result to the console.
static void Main(string[] args)
{
    Console.WriteLine("hello worlds!");
    // --- earlier experiments, kept for reference ---
    //X x = new X(); //A a = new A(); //B b = new B(); //C c = new C(); //D d = new D();
    //GenericStreamDataConsumer gsdc = new GenericStreamDataConsumer();
    //gsdc.OnConsumeData += delegate(IDataProducer sender, object data) { Console.WriteLine((string)data); };
    //Y y = new Y();
    //x.Subscribe(a); //a.Subscribe(b); //b.Subscribe(gsdc); //x.Subscribe(c); //c.Subscribe(d); //d.Subscribe(gsdc);
    //x.Start(); //Console.ReadLine(); //Console.WriteLine("stop"); //x.GracefulStop(); //Console.ReadLine();
    //DocumentCorpus corpus = new DocumentCorpus();
    //Document doc = new Document("This is a very short document. This is some boilerplate.");
    //corpus.Add(doc);
    //Annotation annot = new Annotation(0, 29, "content_block");
    ////doc.AddAnnotation(annot);
    //RegexTokenizerComponent tok = new RegexTokenizerComponent();
    //tok.ReceiveData(null, corpus);
    //Regex mCharsetRegex
    //    = new Regex(@"((charset)|(encoding))\s*=\s*(([""'](?<enc>[^""']+)[""'])|((?<enc>[^\s>""']+)))", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    //Console.WriteLine(mCharsetRegex.Match(@"<?xml version=""1.0"" encoding=""ISO-8859-1""?>").Success);
    //RssFeedComponent rss = new RssFeedComponent(@"http://feeds.abcnews.com/abcnews/moneyheadlines");
    //rss.Start();
    // --- current test: corpus XML serialization ---
    Document firstDoc = new Document("name", "bla bla");
    Document secondDoc = new Document("name2", "bla bla 2");
    firstDoc.AddAnnotation(new Annotation(0, 100, "waka waka"));
    StringWriter output = new StringWriter();
    XmlTextWriter xmlWriter = new XmlTextWriter(output);
    DocumentCorpus corpus = new DocumentCorpus();
    corpus.AddDocument(firstDoc);
    corpus.AddDocument(secondDoc);
    corpus.WriteXml(xmlWriter);
    Console.WriteLine(output);
}
// Runs entity recognition over an already tokenized, POS-tagged text document and
// adds one annotation per recognized entity that resolves to a known instance,
// carrying gazetteer/instance/class URIs as annotation features.
public override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Text") { return; } // requires plain-text (already processed) input
    try
    {
        document.CreateAnnotationIndex(); // speeds up the span-range lookups below
        EntityRecognitionEngine.Document erDoc = new EntityRecognitionEngine.Document();
        // Mirror the document structure (text blocks -> sentences -> tokens) into the
        // entity-recognition engine's own document representation.
        foreach (TextBlock tb in document.GetAnnotatedBlocks(mBlockSelector))
        {
            erDoc.BeginNewTextBlock();
            foreach (TextBlock s in document.GetAnnotatedBlocks("Sentence", tb.SpanStart, tb.SpanEnd)) // *** sentence selector hardcoded
            {
                ArrayList <string> tokens = new ArrayList <string>();
                ArrayList <string> posTags = new ArrayList <string>();
                ArrayList <int> spanInfo = new ArrayList <int>(); // token start offsets in document.Text
                foreach (TextBlock token in document.GetAnnotatedBlocks("Token", s.SpanStart, s.SpanEnd)) // *** token selector hardcoded
                {
                    tokens.Add(token.Text);
                    posTags.Add(token.Annotation.Features.GetFeatureValue("posTag")); // *** POS tag feature name hardcoded
                    spanInfo.Add(token.SpanStart);
                }
                erDoc.AddSentence(tokens, spanInfo, posTags);
            }
        }
        // entities[i] corresponds to spans[i] (document-level character spans)
        ArrayList <Pair <int, int> > spans;
        ArrayList <string> entities = erDoc.DiscoverEntities(mEntityRecognitionEngine, out spans);
        int i = 0;
        foreach (string gazetteerUri in entities)
        {
            string instanceUri = mEntityRecognitionEngine.GetIdentifiedInstance(gazetteerUri);
            if (instanceUri != null) // only annotate entities resolved to a known instance
            {
                string annotationName = GetAnnotationName(mEntityRecognitionEngine.GetInstanceClassPath(instanceUri));
                Annotation annotation = new Annotation(spans[i].First, spans[i].Second, annotationName);
                document.AddAnnotation(annotation);
                annotation.Features.SetFeatureValue("gazetteerUri", gazetteerUri);
                annotation.Features.SetFeatureValue("instanceUri", instanceUri);
                annotation.Features.SetFeatureValue("instanceClassUri", mEntityRecognitionEngine.GetInstanceClass(instanceUri));
                // TODO: instanceLabel, instanceClassLabel
            }
            i++;
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
// Tokenizes every SRC_ANNOT_TYPE block of the document and adds one DEST_ANNOT_TYPE
// annotation per token (inclusive spans, offset by the block's start position).
protected override void ProcessDocument(Document document)
{
    foreach (TextBlock block in document.GetAnnotatedBlocks(SRC_ANNOT_TYPE))
    {
        mTokenizer.Text = block.Text;
        RegexTokenizer.Enumerator tokenEnum = (RegexTokenizer.Enumerator)mTokenizer.GetEnumerator();
        while (tokenEnum.MoveNext())
        {
            // token offsets are relative to the block; shift them into document space
            int tokenStart = block.SpanStart + tokenEnum.CurrentTokenIdx;
            int tokenEnd = tokenStart + tokenEnum.Current.Length - 1;
            document.AddAnnotation(new Annotation(tokenStart, tokenEnd, DEST_ANNOT_TYPE));
        }
    }
}
/// <summary>
/// Returns a full C# syntax tree resolver which is shared between semantic highlighting, source analysis and refactoring.
/// For code analysis tasks this should be used instead of generating an own resolver. Only exception is if a local resolving is done using a
/// resolve navigator.
/// Note: The shared resolver is fully resolved.
/// </summary>
public static Task <CSharpAstResolver> GetSharedResolver(this Document document)
{
    var parsedDocument = document.ParsedDocument;
    if (parsedDocument == null)
        return null;
    var syntaxTree = parsedDocument.GetAst <SyntaxTree> ();
    var unresolvedFile = parsedDocument.ParsedFile as CSharpUnresolvedFile;
    if (syntaxTree == null || unresolvedFile == null)
        return null;
    var compilation = document.Compilation;
    // The resolver task is cached on the document; reuse it while the unresolved
    // file is unchanged, otherwise drop the stale annotation and rebuild.
    var cached = document.Annotation <ResolverAnnotation> ();
    if (cached != null) {
        if (cached.ParsedFile == unresolvedFile)
            return cached.Task;
        document.RemoveAnnotations <ResolverAnnotation> ();
    }
    // Resolve the whole tree on a background task.
    var resolveTask = Task.Factory.StartNew(() => {
        var resolver = new CSharpAstResolver(compilation, syntaxTree, unresolvedFile);
        resolver.ApplyNavigator(new ConstantModeResolveVisitorNavigator(ResolveVisitorNavigationMode.Resolve, null));
        return resolver;
    });
    document.AddAnnotation(new ResolverAnnotation {
        Task = resolveTask,
        ParsedFile = unresolvedFile
    });
    return resolveTask;
}
// Tokenizes every selected text block of a plain-text document and adds one
// "Token" annotation per token (inclusive document-level spans).
/*protected*/ public override void ProcessDocument(Document document)
{
    if (document.Features.GetFeatureValue("contentType") != "Text") { return; }
    try
    {
        foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
        {
            mTokenizer.Text = block.Text;
            RegexTokenizer.Enumerator tokenEnum = (RegexTokenizer.Enumerator)mTokenizer.GetEnumerator();
            while (tokenEnum.MoveNext())
            {
                // shift block-relative token offsets into document space
                int tokenStart = block.SpanStart + tokenEnum.CurrentTokenIdx;
                int tokenEnd = tokenStart + tokenEnum.Current.Length - 1;
                document.AddAnnotation(new Annotation(tokenStart, tokenEnd, "Token"));
            }
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
// Converts an HTML document into plain text. Every text run becomes a "TextBlock"
// annotation carrying two features: "domPath" (slash-joined path of the open tags,
// outermost first) and "linkToTextRatio" ("linkChars/totalChars", where linkChars
// counts characters that occur inside an <a> element). Adjacent runs are merged
// unless separated by a tag from mSplitTags. On success document.Text is replaced
// by the plain text and "contentType" switches from "Html" to "Text".
public /*protected*/ override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Html") { return; }
    try
    {
        HtmlTokenizer htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/ null, /*decode=*/ true, /*tokenize=*/ false, /*applySkipRules=*/ true);
        int idx = 0; // char offset of the next block in the plain-text output
        ArrayList <string> txtBlocks = new ArrayList <string>();
        bool merge = false; // true while the next text token extends the last block
        Stack <string> tags = new Stack <string>(); // currently open tags, innermost on top
        for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
        {
            if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
            {
                string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/ true);
                if (textBlock != "")
                {
                    // FIX: Aggregate throws InvalidOperationException on an empty
                    // sequence, which aborted the whole document for text that
                    // precedes any open tag; use an empty path instead.
                    string domPath = tags.Count == 0 ? "" : tags.Aggregate((x, y) => y + "/" + x);
                    bool isLink = tags.Contains("a");
                    if (!merge)
                    {
                        txtBlocks.Add(textBlock);
                        document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                        document.Annotations.Last.Features.SetFeatureValue("domPath", domPath);
                        document.Annotations.Last.Features.SetFeatureValue("linkToTextRatio", string.Format("{0}/{1}", isLink ? textBlock.Length : 0, textBlock.Length));
                    }
                    else
                    {
                        // merge into the previous block: join with a single space,
                        // replace the last annotation, keep the shorter dom path and
                        // accumulate the link/text character counts.
                        idx--; // the 2-char separator is replaced by 1 space
                        txtBlocks.Last += " " + textBlock;
                        int oldStartIdx = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                        string oldDomPath = document.Annotations.Last.Features.GetFeatureValue("domPath");
                        string oldLinkToTextRatio = document.Annotations.Last.Features.GetFeatureValue("linkToTextRatio");
                        document.RemoveAnnotationAt(document.AnnotationCount - 1);
                        document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                        document.Annotations.Last.Features.SetFeatureValue("domPath", domPath.Length < oldDomPath.Length ? domPath : oldDomPath);
                        int linkCharCount = Convert.ToInt32(oldLinkToTextRatio.Split('/')[0]) + (isLink ? textBlock.Length : 0);
                        int textCharCount = Convert.ToInt32(oldLinkToTextRatio.Split('/')[1]) + textBlock.Length;
                        document.Annotations.Last.Features.SetFeatureValue("linkToTextRatio", string.Format("{0}/{1}", linkCharCount, textCharCount));
                    }
                    // NOTE(review): "+ 2" assumes AppendLine writes a 2-char newline — TODO confirm platform.
                    idx += textBlock.Length + 2;
                    merge = true;
                }
            }
            else
            {
                string tagName = e.CurrentToken.TagName.ToLower();
                if (mSplitTags.Contains(tagName)) { merge = false; } // split tags break block merging
                if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.StartTag)
                {
                    tags.Push(tagName);
                }
                else if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.EndTag)
                {
                    string endTagName = null;
                    if (tags.Count == 0 || (endTagName = tags.Pop()) != tagName)
                    {
                        mLogger.Error("ProcessDocument", "End tag does not match start tag (found {0} instead of {1}).", endTagName == null ? "nothing" : endTagName, tagName);
                        // FIX: only restore a tag that was actually popped; the
                        // original pushed null when the stack was empty, corrupting
                        // subsequent tag matching and dom paths.
                        if (endTagName != null) { tags.Push(endTagName); }
                    }
                }
            }
        }
        // materialize the plain text in the same order the offsets were computed
        StringBuilder sb = new StringBuilder();
        foreach (string textBlock in txtBlocks) { sb.AppendLine(textBlock); }
        document.Text = sb.ToString();
        document.Features.SetFeatureValue("contentType", "Text");
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
/// <summary>
/// Returns a full C# syntax tree resolver which is shared between semantic highlighting, source analysis and refactoring.
/// For code analysis tasks this should be used instead of generating an own resolver. Only exception is if a local resolving is done using a
/// resolve navigator.
/// Note: The shared resolver is fully resolved.
/// </summary>
public static Task <CSharpAstResolver> GetSharedResolver(this Document document)
{
    var parsedDocument = document.ParsedDocument;
    // Bail out when there is no parse result, the project context is being rebuilt,
    // or the document belongs to a project that is not a .NET project.
    if (parsedDocument == null || document.IsProjectContextInUpdate || document.Project != null && !(document.Project is DotNetProject)) {
        return(null);
    }
    var unit = parsedDocument.GetAst <SyntaxTree> ();
    var parsedFile = parsedDocument.ParsedFile as CSharpUnresolvedFile;
    if (unit == null || parsedFile == null) {
        return(null);
    }
    var compilation = document.Compilation;
    // The resolver task is cached on the document as an annotation; reuse it while
    // the unresolved file is unchanged, otherwise cancel the stale run and rebuild.
    var resolverAnnotation = document.Annotation <ResolverAnnotation> ();
    if (resolverAnnotation != null) {
        if (resolverAnnotation.ParsedFile == parsedFile) {
            return(resolverAnnotation.Task); // cache hit
        }
        if (resolverAnnotation.SharedTokenSource != null) {
            resolverAnnotation.SharedTokenSource.Cancel(); // abandon the outdated resolve run
        }
        document.RemoveAnnotations <ResolverAnnotation> ();
    }
    var tokenSource = new CancellationTokenSource();
    var token = tokenSource.Token;
    // Resolve on a background task; cancellation and failures yield null instead of
    // propagating out of the shared task.
    var resolveTask = Task.Factory.StartNew(delegate {
        try {
            using (var timer = ResolveCounter.BeginTiming()) {
                var result = new CSharpAstResolver(compilation, unit, parsedFile);
                result.ApplyNavigator(new ConstantModeResolveVisitorNavigator(ResolveVisitorNavigationMode.Resolve, null), token);
                return(result);
            }
        } catch (OperationCanceledException) {
            return(null);
        } catch (Exception e) {
            LoggingService.LogError("Error while creating the resolver.", e);
            return(null);
        }
    }, token);
    // Wrap the task so that cancellation/faults surface as a null result to callers;
    // ExecuteSynchronously avoids an extra scheduling hop for the trivial continuation.
    var wrapper = resolveTask.ContinueWith(t => {
        if (t.IsCanceled) {
            return(null);
        }
        if (t.IsFaulted) {
            var ex = t.Exception.Flatten().InnerException;
            if (!(ex is TaskCanceledException)) {
                LoggingService.LogWarning("Exception while getting shared AST resolver.", ex);
            }
            return(null);
        }
        return(t.Result);
    }, TaskContinuationOptions.ExecuteSynchronously);
    // cache the wrapper (plus the token source so a later call can cancel it)
    document.AddAnnotation(new ResolverAnnotation {
        Task = wrapper,
        ParsedFile = parsedFile,
        SharedTokenSource = tokenSource
    });
    return(wrapper);
}
// Entity-recognition stage: feeds the tokenized, POS-tagged document into the
// entity-recognition engine and annotates each entity that resolves to a known
// instance with its gazetteer, instance, and class URIs.
public override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (contentType != "Text") { return; } // requires plain-text (already processed) input
    try
    {
        document.CreateAnnotationIndex(); // speeds up the span-range lookups below
        EntityRecognitionEngine.Document erDoc = new EntityRecognitionEngine.Document();
        // Copy the document structure (text blocks -> sentences -> tokens) into the
        // entity-recognition engine's own representation.
        foreach (TextBlock tb in document.GetAnnotatedBlocks(mBlockSelector))
        {
            erDoc.BeginNewTextBlock();
            foreach (TextBlock s in document.GetAnnotatedBlocks("Sentence", tb.SpanStart, tb.SpanEnd)) // *** sentence selector hardcoded
            {
                ArrayList<string> tokens = new ArrayList<string>();
                ArrayList<string> posTags = new ArrayList<string>();
                ArrayList<int> spanInfo = new ArrayList<int>(); // token start offsets in document.Text
                foreach (TextBlock token in document.GetAnnotatedBlocks("Token", s.SpanStart, s.SpanEnd)) // *** token selector hardcoded
                {
                    tokens.Add(token.Text);
                    posTags.Add(token.Annotation.Features.GetFeatureValue("posTag")); // *** POS tag feature name hardcoded
                    spanInfo.Add(token.SpanStart);
                }
                erDoc.AddSentence(tokens, spanInfo, posTags);
            }
        }
        // entities[i] corresponds to spans[i] (document-level character spans)
        ArrayList<Pair<int, int>> spans;
        ArrayList<string> entities = erDoc.DiscoverEntities(mEntityRecognitionEngine, out spans);
        int i = 0;
        foreach (string gazetteerUri in entities)
        {
            string instanceUri = mEntityRecognitionEngine.GetIdentifiedInstance(gazetteerUri);
            if (instanceUri != null) // only annotate entities resolved to a known instance
            {
                string annotationName = GetAnnotationName(mEntityRecognitionEngine.GetInstanceClassPath(instanceUri));
                Annotation annotation = new Annotation(spans[i].First, spans[i].Second, annotationName);
                document.AddAnnotation(annotation);
                annotation.Features.SetFeatureValue("gazetteerUri", gazetteerUri);
                annotation.Features.SetFeatureValue("instanceUri", instanceUri);
                annotation.Features.SetFeatureValue("instanceClassUri", mEntityRecognitionEngine.GetInstanceClass(instanceUri));
                // TODO: instanceLabel, instanceClassLabel
            }
            i++;
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}