// Builds a child indexable for a single ECMA-XML <Member> node.
// The uri fragment takes the form "<kind>:<parent>[.<member>][(<param-types>)]",
// e.g. "M:System.String.IndexOf(System.Char)".
static private Indexable MemberNodeToIndexable(XmlNode node, Uri base_uri, string parentName)
{
	string member_type_text = node.SelectSingleNode("MemberType").InnerText;
	char kind = MemberTypeToChar(member_type_text);

	StringBuilder full_name = new StringBuilder();
	full_name.Append(kind);
	full_name.Append(':');
	full_name.Append(parentName);

	// Constructors ('C') are named after their parent type, so only
	// non-constructors get ".<MemberName>" appended.
	if (kind != 'C') {
		full_name.Append("." + node.Attributes["MemberName"].Value);
	}

	// Constructors, methods and events carry a parameter-type list.
	if (kind == 'C' || kind == 'M' || kind == 'E') {
		full_name.Append("(");

		string separator = "";
		foreach (XmlNode parameter in node.SelectNodes("Parameters/Parameter")) {
			full_name.Append(separator);
			full_name.Append(parameter.Attributes["Type"].Value);
			separator = ",";
		}

		full_name.Append(")");
	}

	string fragment = full_name.ToString();

	Indexable indexable = new Indexable(UriFu.AddFragment(base_uri, fragment, false));
	indexable.MimeType = "text/html";
	indexable.HitType = "MonodocEntry";

	indexable.AddProperty(Property.New("dc:title", fragment));
	indexable.AddProperty(Property.New("fixme:name", fragment));
	indexable.AddProperty(Property.NewUnsearched("fixme:type", member_type_text.ToLower()));

	// Index the raw <Docs> markup as the entry's text content.
	StringReader reader = new StringReader(node.SelectSingleNode("Docs").InnerXml);
	indexable.SetTextReader(reader);

	return (indexable);
}
// Builds a child indexable for an ECMA-XML <Type> node, using the
// "T:<FullName>" fragment both as the uri fragment and as the
// title/name properties.
static private Indexable TypeNodeToIndexable(XmlNode node, Uri base_uri)
{
	// Build the "T:<FullName>" string once and reuse it; the original
	// rebuilt the identical concatenation for each property.
	string fragment = "T:" + node.Attributes ["FullName"].Value;

	Indexable indexable = new Indexable(UriFu.AddFragment(base_uri, fragment, false));
	indexable.MimeType = "text/html";
	indexable.HitType = "MonodocEntry";

	indexable.AddProperty(Property.New("dc:title", fragment));
	indexable.AddProperty(Property.NewUnsearched("fixme:name", fragment));
	indexable.AddProperty(Property.NewUnsearched("fixme:type", "type"));

	// Index the raw <Docs> markup as the entry's text content.
	StringReader reader = new StringReader(node.SelectSingleNode("Docs").InnerXml);
	indexable.SetTextReader(reader);

	return (indexable);
}
// Dumps metadata for hits stored in the named index: either every hit
// (when 'uris' is empty) or only the hits matching 'uris'.  Prints one
// line per hit, or a full property listing when show_properties is set.
// Returns the number of hits printed.
static int DumpOneIndex_Metadata(string index_name, ArrayList uris, bool show_properties)
{
	LuceneQueryingDriver driver;
	driver = new LuceneQueryingDriver(index_name, -1, true);

	Hashtable all_hits_by_uri = null;
	ArrayList all_hits = null;

	// The full hit set is needed both when dumping everything and when
	// the FileSystemIndex remapping hack below has to consult it.
	if (uris.Count == 0 || index_name == "FileSystemIndex") {
		all_hits_by_uri = driver.GetAllHitsByUri();
		all_hits = new ArrayList(all_hits_by_uri.Values);
	}

	// A hard-wired hack
	// The file-system backend stores internal uris; rewrite each hit's
	// uri (and parent uri for child hits) to the real on-disk path.
	if (index_name == "FileSystemIndex") {
		foreach (Hit hit in all_hits) {
			string internal_uri;

			if (hit [Property.IsChildPropKey] == "true") {
				// Child hit: remap the parent's uri to a path and
				// re-attach this hit's fragment to the new path.
				// NOTE(review): internal_uri is computed here but never
				// recorded for child hits (only the else branch adds the
				// beagrep:InternalUri property) — confirm intentional.
				string path = RemapUriToPath(all_hits_by_uri, hit);
				internal_uri = UriFu.UriToEscapedString(hit.ParentUri);
				hit.ParentUri = UriFu.PathToFileUri(path);
				hit.Uri = UriFu.AddFragment(UriFu.PathToFileUri(path), hit.Uri.Fragment, true);
			} else {
				internal_uri = UriFu.UriToEscapedString(hit.Uri);
				hit.Uri = UriFu.PathToFileUri(RemapUriToPath(all_hits_by_uri, hit));
				// Preserve the pre-remap uri so it shows up in the dump.
				hit.AddProperty(Property.NewUnsearched("beagrep:InternalUri", internal_uri));
			}
		}
	}

	ArrayList matching_hits;

	if (uris.Count == 0) {
		matching_hits = all_hits;
	} else {
		matching_hits = new ArrayList(driver.GetHitsForUris(RemapUris(driver, uris)));

		if (index_name == "FileSystemIndex") {
			// Swap each freshly-queried hit for the remapped version
			// built above (the query results still carry internal uris).
			for (int i = 0; i < matching_hits.Count; i++) {
				Hit hit = (Hit)matching_hits [i];
				Hit mapped_hit = (Hit)all_hits_by_uri [hit.Uri];
				matching_hits [i] = mapped_hit;
			}
		}
	}

	matching_hits.Sort(new HitByUriComparer());

	foreach (Hit hit in matching_hits) {
		// Terse mode: one "<index>: <uri>" line per hit.
		if (!show_properties) {
			Console.WriteLine("{0}: {1}", index_name, hit.Uri);
			continue;
		}

		Console.WriteLine(" Index: {0}", index_name);
		Console.WriteLine("   Uri: {0}", hit.Uri);
		if (hit.ParentUri != null) {
			Console.WriteLine("Parent: {0}", hit.ParentUri);
		}
		Console.WriteLine(" MimeT: {0}", hit.MimeType);
		Console.WriteLine("  Type: {0}", hit.Type);
		Console.WriteLine("Source: {0}", hit.Source);

		ArrayList props;
		props = new ArrayList(hit.Properties);
		props.Sort();

		foreach (Property prop in props) {
			// Legend flags: m=mutable, s=searched, p=persistent,
			// t=text-type property.
			char [] legend = new char [4];
			legend [0] = prop.IsMutable ? 'm' : ' ';
			legend [1] = prop.IsSearched ? 's' : ' ';
			legend [2] = prop.IsPersistent ? 'p' : ' ';
			legend [3] = prop.Type == PropertyType.Text ? 't' : ' ';

			Console.WriteLine("  Prop: [{0}] {1} = '{2}'", new String(legend), prop.Key, prop.Value);
		}

		Console.WriteLine();
	}

	return (matching_hits.Count);
}
// Walks the docbook file with a pull reader.  Each sect*/chapter
// element that carries an id becomes a child indexable (tracked on
// entries_stack, matched open/close by element depth); the book-level
// title and language become top-level properties.
override protected void DoPullProperties()
{
	Stopwatch watch = new Stopwatch();
	watch.Start();

	while (reader.Read()) {
		switch (reader.NodeType) {
		case XmlNodeType.Element:
			if (reader.Name.StartsWith("sect") || reader.Name.StartsWith("chapter")) {
				// Start of a (possibly nested) section; only sections
				// with an id attribute become child indexables.
				string id = reader.GetAttribute("id");
				if (id != null && id != String.Empty) {
					DocbookEntry entry = new DocbookEntry();
					entry.Id = id;
					// Remember the element depth so the matching
					// EndElement can be recognized below.
					entry.Depth = reader.Depth;

					string language = reader.GetAttribute("lang");
					if (language != null && language != String.Empty) {
						entry.Language = language;
					}

					entries_stack.Push(entry);
				}
			} else if (reader.Name == "article" || reader.Name == "book") {
				string language = reader.GetAttribute("lang");
				if (language != null && language != String.Empty) {
					base_language = language;
				}
			} else if (reader.Name == "title") {
				reader.Read(); // Go to the text node

				if (entries_stack.Count == 0 && base_title == null) {
					// This is probably the book title
					base_title = reader.Value;
				} else if (entries_stack.Count > 0) {
					DocbookEntry entry = (DocbookEntry)entries_stack.Peek();
					// Only the first title inside a section wins.
					if (entry.Title == null) {
						entry.Title = reader.Value;
					}
				}
			} else if (reader.Name == "keyword") {
				reader.Read(); // read the text node
				AddProperty(Property.NewKeyword("dc:subject", reader.Value));
			}
			break;

		case XmlNodeType.Text:
			// Append text to the child indexable
			if (entries_stack.Count > 0) {
				((DocbookEntry)entries_stack.Peek()).Content.Append(reader.Value);
			}
			// Append text to the main indexable
			else {
				AppendWord(reader.Value);
			}
			break;

		case XmlNodeType.EndElement:
			// A depth match means the section on top of the stack is
			// the one being closed here.
			if (entries_stack.Count > 0 && ((DocbookEntry)entries_stack.Peek()).Depth == reader.Depth) {
				DocbookEntry entry, parent_entry = null;
				entry = (DocbookEntry)entries_stack.Pop();

				if (entries_stack.Count > 0) {
					parent_entry = (DocbookEntry)entries_stack.Peek();
				}

				Indexable indexable;
				indexable = new Indexable(UriFu.AddFragment(Indexable.Uri, entry.Id, false));
				indexable.HitType = "DocbookEntry";
				indexable.MimeType = "text/x-docbook-entry";
				indexable.AddProperty(Property.NewKeyword("beagle:FileType", "documentation"));
				// Content was accumulated as plain text above, so no
				// further filtering pass is needed.
				indexable.Filtering = IndexableFiltering.AlreadyFiltered;

				indexable.AddProperty(Property.NewUnsearched("fixme:id", entry.Id));
				indexable.AddProperty(Property.New("dc:title", entry.Title));

				// Add the docbook book title
				indexable.AddProperty(Property.NewUnsearched("fixme:base_title", base_title));

				// Add the child language (or docbook language if none is specified)
				if (entry.Language != null) {
					indexable.AddProperty(Property.NewUnsearched("fixme:language", entry.Language));
				} else if (base_language != null) {
					indexable.AddProperty(Property.NewUnsearched("fixme:language", base_language));
				}

				// Add any parent (as in docbook parent entry, not beagle) data if we have it
				if (parent_entry != null) {
					indexable.AddProperty(Property.NewUnsearched("fixme:parent_id", parent_entry.Id));
					indexable.AddProperty(Property.NewUnsearched("fixme:parent_title", parent_entry.Title));
				}

				StringReader content_reader = new StringReader(entry.Content.ToString());
				indexable.SetTextReader(content_reader);
				indexable.SetChildOf(this.Indexable);

				AddIndexable(indexable);
			}
			break;
		}
	}

	// Add the common properties to the top-level
	// file item such as Title, Language etc.
	AddProperty(Property.New("dc:title", base_title));
	AddProperty(Property.NewUnsearched("fixme:language", base_language));

	watch.Stop();

	// If we've successfully crawled the file but haven't
	// found any indexables, we shouldn't consider it
	// successful at all (unless we have a title, which
	// means that it's actually a docbook file, just without
	// sections).
	if (!HasGeneratedIndexable && base_title == null) {
		Log.Error("Probably not a docbook. Ignoring {0}!", base_path);
		Error();
		return;
	}

	Logger.Log.Debug("Parsed docbook file in {0}", watch);

	Finished();
}
// Reads the next bibtex entry from 'reader' and turns it into a child
// indexable.  An entry looks like:
//   @<type> <name>
//   <key>=<value>      (one property line per row)
//   <blank line>       (terminates the entry)
// Returns false (child stays null) when the stream is exhausted or
// the external bibparse helper could not be started.
public override bool GenerateNextIndexable(out Indexable child)
{
	child = null;

	if (bib_process == null && !InitBibparse()) {
		return (false);
	}

	string line = null;
	string type = null, name = null;

	// Scan forward to the next "@<type> <name>" header line.
	while ((line = reader.ReadLine()) != null) {
		if (line == String.Empty || line [0] != '@') {
			continue;
		}

		int i = line.IndexOf(' ');
		// Need both a non-empty type and a non-empty name.
		if (i == -1 || line.Length == i + 1) {
			continue;
		}

		type = line.Substring(1, i - 1).ToLower();
		name = line.Substring(i + 1);
		break;
	}

	if (line == null) {
		return (false);
	}

	child = new Indexable(UriFu.AddFragment(Indexable.Uri, name, false));
	child.CacheContent = false;
	child.MimeType = "text/x-bibtex";
	child.DisplayUri = child.Uri;
	child.NoContent = true;
	child.AddProperty(Property.NewKeyword("bibtex:type", type));

	string key, value;

	// Now fill in properties from the key=value lines
	while ((line = reader.ReadLine()) != null) {
		// Entries are separated by empty lines
		if (line == String.Empty) {
			break;
		}

		int i = line.IndexOf('=');

		// ensure non-empty key and non-empty value
		// FIX: this used to test "line.Length == i + i" — surely a typo
		// for "i + 1" (matching the header check above).  The old test
		// dropped valid lines whose '=' sat exactly at the midpoint and
		// accepted lines with an empty value.
		if (i < 1 || line.Length == i + 1) {
			continue;
		}

		key = line.Substring(0, i).ToLower();
		value = line.Substring(i + 1);

		foreach (Property prop in EntryLineToProperty(key, value)) {
			child.AddProperty(prop);
		}
	}

	child.SetChildOf(Indexable);

	return (true);
}
// Pulls the next entry out of the archive stream and wraps it in a
// child indexable.  Returns false when the archive is exhausted or a
// safety limit (entry count, total uncompressed size) is crossed;
// returns true with child == null for skipped (invalid/oversized)
// entries so the caller keeps iterating.
public override bool GenerateNextIndexable(out Indexable child)
{
	ArchiveEntry a_entry;
	child = null;

	if (!setup_done) {
		SetupArchiveStream();
	}

	// Safety valve 1: cap the number of extracted entries.
	if (count >= MAX_CHILDREN) {
		Log.Debug("Archive {0} contains more than {1} files. Only {1} files indexed.", Indexable.DisplayUri.ToString(), count);
		Close();
		return (false);
	}

	// Safety valve 2: cap the total uncompressed size.
	if (total_size > MAX_ALL_FILES) {
		Log.Debug("Archive {0} crossed our max uncompressed size threshold. Only {1} files extracted", Indexable.DisplayUri.ToString(), count);
		Close();
		return (false);
	}

	a_entry = DoGetNextEntry();
	if (a_entry == null) {
		Close();
		return (false);
	}

	// Store file names in the archive
	// (appended to the archive's own indexable text, so entry names
	// are searchable even when the entry itself is skipped).
	AppendText(Path.GetFileName(a_entry.Name));
	AppendWhiteSpace();

	// If this is an invalid or oversized entry, skip it.
	if (a_entry.TempFile == null) {
		return (true);
	}

	++count;
	total_size += a_entry.Size;

	// Add "#<escaped-path-to-entry>" to the end of the Indexable Uri
	// So, file b#c in archive foo.zip becomes file:///foo.zip#b%23c
	// And file c in archive b in archive foo.zip becomes file:///foo.zip#b#c
	child = new Indexable(UriFu.AddFragment(Indexable.Uri, a_entry.Name, false));

	child.CacheContent = true;
	child.MimeType = a_entry.MimeType;

	child.DisplayUri = new Uri(Indexable.DisplayUri.ToString() + "#" + a_entry.Name);
	// Content comes from the temp file the entry was extracted to;
	// it can be deleted once indexing is done.
	child.ContentUri = UriFu.PathToFileUri(a_entry.TempFile);
	child.DeleteContent = true;

	// FIXME Remove fixme:inside_archive during Property Hack Week
	// Replace most flag properties by value properties
	child.AddProperty(Property.NewBool("fixme:inside_archive", true));
	// Use this instead of fixme:inside_archive
	child.AddProperty(Property.NewKeyword("archive:type", archive_type));

	child.AddProperty(Property.NewKeyword("fixme:relativeuri", a_entry.Name));
	child.AddProperty(Property.New("fixme:comment", a_entry.Comment));
	child.AddProperty(Property.NewUnsearched("fixme:filesize", a_entry.Size));

	foreach (Property prop in Property.StandardFileProperties(Path.GetFileName(a_entry.Name), false)) {
		child.AddProperty(prop);
	}

	child.SetChildOf(Indexable);

	return (true);
}
// Recursively walks one MIME part of the message.  A top-level
// plain-text or HTML body is attached directly to this filter's
// reader; every other leaf part becomes a child indexable, unless
// its mime type is blacklisted.  Attached messages and multiparts
// recurse; multipart/alternative keeps only the richest part we can
// filter.
public void OnEachPart(GMime.Object mime_part)
{
	GMime.Object part = null;
	bool part_needs_dispose = false;

	//for (int i = 0; i < this.depth; i++)
	//	Console.Write ("  ");
	//Console.WriteLine ("Content-Type: {0}", mime_part.ContentType);

	++depth;

	if (mime_part is GMime.MessagePart) {
		// An attached message: recurse into its own top-level part.
		GMime.MessagePart msg_part = (GMime.MessagePart)mime_part;

		using (GMime.Message message = msg_part.Message) {
			using (GMime.Object subpart = message.MimePart)
				this.OnEachPart(subpart);
		}
	} else if (mime_part is GMime.Multipart) {
		GMime.Multipart multipart = (GMime.Multipart)mime_part;
		int num_parts = multipart.Count;

		// If the mimetype is multipart/alternative, we only want to index
		// one part -- the richest one we can filter.
		if (mime_part.ContentType.MediaSubtype.ToLower() == "alternative") {
			// The richest formats are at the end, so work from there
			// backward.
			for (int i = num_parts - 1; i >= 0; i--) {
				GMime.Object subpart = multipart[i];

				if (IsMimeTypeHandled(subpart.ContentType.ToString())) {
					part = subpart;
					part_needs_dispose = true;
					break;
				} else {
					subpart.Dispose();
				}
			}
		}

		// If it's not alternative, or we don't know how to filter any of
		// the parts, treat them like a bunch of attachments.
		if (part == null) {
			for (int i = 0; i < num_parts; i++) {
				using (GMime.Object subpart = multipart[i])
					this.OnEachPart(subpart);
			}
		}
	} else if (mime_part is GMime.Part) {
		part = mime_part;
	} else {
		// FIX: this used to format part.GetType(), but 'part' is always
		// null on this path ('part' is only assigned in the branches
		// above), so the intended diagnostic itself crashed with a
		// NullReferenceException.  Report the actual unknown object.
		throw new Exception(String.Format("Unknown part type: {0}", mime_part.GetType()));
	}

	if (part != null) {
		System.IO.Stream stream = null;

		using (GMime.DataWrapper content_obj = ((GMime.Part)part).ContentObject)
			stream = content_obj.Stream;

		// If this is the only part and it's plain text, we
		// want to just attach it to our filter instead of
		// creating a child indexable for it.
		bool no_child_needed = false;

		string mime_type = part.ContentType.ToString().ToLower();

		if (this.depth == 1 && this.count == 0) {
			if (mime_type == "text/plain") {
				no_child_needed = true;

				this.reader = new StreamReader(stream);
			} else if (mime_type == "text/html") {
				no_child_needed = true;
				html_part = true;
				string enc = part.ContentType.GetParameter("charset");

				// DataWrapper.Stream is a very limited stream
				// and does not allow Seek or Tell
				// HtmlFilter requires Stream.Position=0.
				// Play safe and create a memorystream
				// for HTML parsing.
				GMime.StreamMem mem_stream;
				mem_stream = new GMime.StreamMem();

				GMime.Stream data_stream;
				data_stream = ((StreamWrapper)stream).GMimeStream;
				data_stream.WriteToStream(mem_stream);
				data_stream.Flush();

				// The StreamWrapper and hence the memory_stream
				// will be closed when the reader is closed
				// after Pull()-ing is done.
				System.IO.Stream html_stream;
				html_stream = new StreamWrapper(mem_stream);
				html_stream.Seek(0, SeekOrigin.Begin);

				stream.Close();

				try {
					this.reader = FilterHtml.GetHtmlReader(html_stream, enc, link_handler);
				} catch (Exception e) {
					// Fall back to no content rather than failing the
					// whole message.
					Log.Debug(e, "Exception while filtering HTML email {0}", this.indexable.Uri);
					this.reader = null;
					html_stream.Close();
					html_part = false;
				}
			}
		}

		if (!no_child_needed) {
			// Check the mime type against the blacklist and don't index any
			// parts that are contained within.  That way the user doesn't
			// get flooded with pointless signatures and vcard and ical
			// attachments along with (real) attachments.

			if (Array.IndexOf(blacklisted_mime_types, mime_type) == -1) {
				string sub_uri = "#" + this.count;
				Indexable child;
				child = new Indexable(UriFu.AddFragment(this.indexable.Uri, sub_uri, true));
				child.DisplayUri = new Uri(this.indexable.DisplayUri.ToString() + "#" + this.count);

				// This is a special case.
				// Even for mails found on disk, MailMessage hitype is set
				child.HitType = "MailMessage";
				child.MimeType = mime_type;

				// If this is the richest part we found for multipart emails, add its content to textcache
				if (snippet_attachment || (this.depth == 1 && this.count == 0)) {
					child.CacheContent = true;
				} else {
					child.CacheContent = false;
				}

				string filename = ((GMime.Part)part).Filename;

				if (!String.IsNullOrEmpty(filename)) {
					child.AddProperty(Property.NewKeyword("fixme:attachment_title", filename));

					foreach (Property prop in Property.StandardFileProperties(filename, false)) {
						child.AddProperty(prop);
					}
				}

				// Store length of attachment
				long length = stream.Length;
				if (length != -1) {
					child.AddProperty(Property.NewUnsearched("fixme:filesize", length));
				}

				if (part.ContentType.MediaType.ToLower() == "text") {
					child.SetTextReader(new StreamReader(stream));
				} else {
					child.SetBinaryStream(stream);
				}

				child.SetChildOf(this.indexable);
				child.StoreStream();
				child.CloseStreams();

				this.child_indexables.Add(child);
			} else {
				Log.Debug("Skipping attachment {0}#{1} with blacklisted mime type {2}", this.indexable.Uri, this.count, mime_type);
			}
		}

		this.count++;
	}

	if (part_needs_dispose) {
		part.Dispose();
	}

	--depth;
}