Crawler/Anemone: Dirbusting now optional
[FIXRM #8030] Anemone is updated to make dirbusting optional (still on by default), and the Crawler core module now exposes a RUN_DIRBUSTER option to control it.
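The net effect, seen from a caller: the crawl keeps bruteforcing common paths by default but can now be told not to. A minimal sketch, assuming the vendored copy keeps upstream Anemone's Anemone.crawl entry point and block API:

    require 'anemone'

    # :dirbust defaults to true, so existing callers keep the old behavior;
    # passing false skips the Dirbuster extractor entirely.
    Anemone.crawl('http://example.com/', :dirbust => false) do |anemone|
      anemone.on_every_page do |page|
        puts page.url  # only links actually discovered on pages
      end
    end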
+2 -1
@@ -53,7 +53,8 @@ module Anemone
       # accept cookies from the server and send them back?
       :accept_cookies => false,
       # skip any link with a query string? e.g. http://foo.com/?u=user
-      :skip_query_strings => false
+      :skip_query_strings => false,
+      :dirbust => true
     }

     # Create setter methods for all options to be called from the crawl block
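The context line above matters: Anemone generates a setter for every key in this options hash, so the new key should also be flippable from inside the crawl block. A hypothetical one-liner, under the assumption that the generated setters follow the option names:

    Anemone.crawl('http://example.com/') do |anemone|
      anemone.dirbust = false  # overrides the :dirbust => true default
    end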
+9 -1
@@ -55,6 +55,7 @@ module Anemone
       @url = url
       @data = OpenStruct.new

+      @dirbust = params[:dirbust]
       @code = params[:code]
       @headers = params[:headers] || {}
       @headers['content-type'] ||= ['']
@@ -83,7 +84,10 @@ module Anemone

     def run_extractors
       return [] if !doc
-      self.class.extractors.map { |e| e.new( self ).run rescue next }.flatten.
+      self.class.extractors.map do |e|
+        next if e == Extractors::Dirbuster && !dirbust?
+        e.new( self ).run rescue next
+      end.flatten.
         compact.map do |p|
           abs = to_absolute( URI( p ) ) rescue next
           !in_domain?( abs ) ? nil : abs
@@ -181,6 +185,10 @@ module Anemone
       return absolute
     end

+    def dirbust?
+      @dirbust
+    end
+
     #
     # Returns +true+ if *uri* is in the same domain as the page, returns
     # +false+ otherwise
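Two Ruby idioms carry the run_extractors change: `next` inside a map block yields nil for that element (the explicit skip and the `rescue next` guard around a failing extractor both end up there), and the chain's later .compact drops those nils. A standalone illustration:

    items = [:dirbuster, :links, :paths]
    results = items.map do |e|
      next if e == :dirbuster   # skip: the block yields nil for this element
      "ran #{e}"                # (`... rescue next` produces nil the same way)
    end
    p results           # => [nil, "ran links", "ran paths"]
    p results.compact   # => ["ran links", "ran paths"]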
@@ -46,7 +46,9 @@ module Anemone
                         :referer => referer,
                         :depth => depth,
                         :redirect_to => redirect_to,
-                        :response_time => response_time)
+                        :response_time => response_time,
+                        :dirbust => @opts[:dirbust]
+      )
       # Store the associated raw HTTP request
       page.request = response.request
       pages << page
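This is the hand-off point: the crawler copies @opts[:dirbust] into each Page it builds, and Page#dirbust? (added above) reflects it back when run_extractors asks. A hedged sketch of that plumbing, assuming Page can still be constructed directly with a params hash as in upstream Anemone:

    require 'anemone'

    page = Anemone::Page.new(URI('http://example.com/'), :dirbust => false)
    page.dirbust?  # => false, so the Dirbuster extractor is skipped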
@@ -20,6 +20,7 @@ module Auxiliary::HttpCrawler
       OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
       OptString.new('URI', [ true, "The starting page to crawl", "/"]),
       Opt::Proxies,
+      OptBool.new('RUN_DIRBUSTER', [ false, 'Bruteforce common URL paths while crawling', true]),
       OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
       OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
       OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4]),
@@ -173,6 +174,10 @@ module Auxiliary::HttpCrawler
       datastore['MAX_THREADS']
     end

+    def dirbust?
+      datastore['RUN_DIRBUSTER']
+    end
+
     # Scrub links that end in these extensions. If more or less is
     # desired by a particular module, this should get redefined.
     def get_link_filter
@@ -275,6 +280,7 @@ module Auxiliary::HttpCrawler
       opts[:framework] = framework
       opts[:module] = self
       opts[:timeout] = get_connection_timeout
+      opts[:dirbust] = dirbust?

       if (t[:headers] and t[:headers].length > 0)
         opts[:inject_headers] = t[:headers]
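From the Metasploit side, the switch surfaces as a regular datastore option, so dirbusting can be toggled per run. A sketch of a typical session, assuming the stock auxiliary/scanner/http/crawler module (which mixes in Auxiliary::HttpCrawler; option names besides RUN_DIRBUSTER are the mixin's usual ones):

    msf > use auxiliary/scanner/http/crawler
    msf auxiliary(crawler) > set RHOST example.com
    msf auxiliary(crawler) > set RUN_DIRBUSTER false
    msf auxiliary(crawler) > run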