Crawler/Anemone: Dirbusting now optional

[FIXRM #8030]

Anemone updated to make dirbusting optional (on by default) and the Crawler core
module updated to provide an option to do so.
This commit is contained in:
Tasos Laskos
2013-06-13 00:00:09 +03:00
parent ff8afc1490
commit b474cda4aa
4 changed files with 20 additions and 3 deletions
+2 -1
View File
@@ -53,7 +53,8 @@ module Anemone
# accept cookies from the server and send them back?
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false
:skip_query_strings => false,
:dirbust => true
}
# Create setter methods for all options to be called from the crawl block
+9 -1
View File
@@ -55,6 +55,7 @@ module Anemone
@url = url
@data = OpenStruct.new
@dirbust = params[:dirbust]
@code = params[:code]
@headers = params[:headers] || {}
@headers['content-type'] ||= ['']
@@ -83,7 +84,10 @@ module Anemone
def run_extractors
return [] if !doc
self.class.extractors.map { |e| e.new( self ).run rescue next }.flatten.
self.class.extractors.map do |e|
next if e == Extractors::Dirbuster && !dirbust?
e.new( self ).run rescue next
end.flatten.
compact.map do |p|
abs = to_absolute( URI( p ) ) rescue next
!in_domain?( abs ) ? nil : abs
@@ -181,6 +185,10 @@ module Anemone
return absolute
end
# Predicate: should directory brute-forcing (dirbusting) run for this page?
# Returns the raw :dirbust value captured from the params at initialization
# (may be nil if the option was never supplied); any truthy value enables
# the Dirbuster extractor in run_extractors.
def dirbust?
@dirbust
end
#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
+3 -1
View File
@@ -46,7 +46,9 @@ module Anemone
:referer => referer,
:depth => depth,
:redirect_to => redirect_to,
:response_time => response_time)
:response_time => response_time,
:dirbust => @opts[:dirbust]
)
# Store the associated raw HTTP request
page.request = response.request
pages << page
+6
View File
@@ -20,6 +20,7 @@ module Auxiliary::HttpCrawler
OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
OptString.new('URI', [ true, "The starting page to crawl", "/"]),
Opt::Proxies,
OptBool.new('RUN_DIRBUSTER', [ false, 'Determine if the crawler should perform dirbusting on each URL', true]),
OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4]),
@@ -173,6 +174,10 @@ module Auxiliary::HttpCrawler
datastore['MAX_THREADS']
end
# Whether the crawler should perform dirbusting, as configured by the
# module's RUN_DIRBUSTER datastore option (registered with a default of
# true, so dirbusting stays on unless explicitly disabled).
def dirbust?
datastore['RUN_DIRBUSTER']
end
# Scrub links that end in these extensions. If more or less is
# desired by a particular module, this should get redefined.
def get_link_filter
@@ -275,6 +280,7 @@ module Auxiliary::HttpCrawler
opts[:framework] = framework
opts[:module] = self
opts[:timeout] = get_connection_timeout
opts[:dirbust] = dirbust?
if (t[:headers] and t[:headers].length > 0)
opts[:inject_headers] = t[:headers]