Crawler/Anemone: Dirbusting now optional

[FIXRM #8030]

Anemone updated to make dirbusting optional (on by default) and the Crawler core
module updated to provide an option to do so.
This commit is contained in:
Tasos Laskos
2013-06-13 00:00:09 +03:00
parent ff8afc1490
commit b474cda4aa
4 changed files with 20 additions and 3 deletions
+2 -1
View File
@@ -53,7 +53,8 @@ module Anemone
# accept cookies from the server and send them back?
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false
:skip_query_strings => false,
:dirbust => true
}
# Create setter methods for all options to be called from the crawl block
+9 -1
View File
@@ -55,6 +55,7 @@ module Anemone
@url = url
@data = OpenStruct.new
@dirbust = params[:dirbust]
@code = params[:code]
@headers = params[:headers] || {}
@headers['content-type'] ||= ['']
@@ -83,7 +84,10 @@ module Anemone
def run_extractors
return [] if !doc
self.class.extractors.map { |e| e.new( self ).run rescue next }.flatten.
self.class.extractors.map do |e|
next if e == Extractors::Dirbuster && !dirbust?
e.new( self ).run rescue next
end.flatten.
compact.map do |p|
abs = to_absolute( URI( p ) ) rescue next
!in_domain?( abs ) ? nil : abs
@@ -181,6 +185,10 @@ module Anemone
return absolute
end
# Predicate: should directory brute-forcing (dirbusting) run for this page?
# Returns the raw :dirbust value captured from the params at initialization
# (may be nil if the option was never supplied); any truthy value enables
# the Dirbuster extractor in run_extractors.
def dirbust?
@dirbust
end
#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
+3 -1
View File
@@ -46,7 +46,9 @@ module Anemone
:referer => referer,
:depth => depth,
:redirect_to => redirect_to,
:response_time => response_time)
:response_time => response_time,
:dirbust => @opts[:dirbust]
)
# Store the associated raw HTTP request
page.request = response.request
pages << page
+6
View File
@@ -20,6 +20,7 @@ module Auxiliary::HttpCrawler
OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
OptString.new('URI', [ true, "The starting page to crawl", "/"]),
Opt::Proxies,
OptBool.new('RUN_DIRBUSTER', [ false, 'Determine if the crawler should perform dirbusting on each URL', true]),
OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4]),
@@ -173,6 +174,10 @@ module Auxiliary::HttpCrawler
datastore['MAX_THREADS']
end
# Whether the crawler should perform dirbusting, as configured by the
# module's RUN_DIRBUSTER datastore option (registered with a default of
# true, so dirbusting stays on unless explicitly disabled).
def dirbust?
datastore['RUN_DIRBUSTER']
end
# Scrub links that end in these extensions. If more or less is
# desired by a particular module, this should get redefined.
def get_link_filter
@@ -275,6 +280,7 @@ module Auxiliary::HttpCrawler
opts[:framework] = framework
opts[:module] = self
opts[:timeout] = get_connection_timeout
opts[:dirbust] = dirbust?
if (t[:headers] and t[:headers].length > 0)
opts[:inject_headers] = t[:headers]