require 'rubygems'
require 'pathname'
require 'hpricot'
require 'uri'

class CrawlerSimple < BaseParser

	def parse(request,result)
		
		if !result['Content-Type'].include? "text/html"
			return
		end
		
		doc = Hpricot(result.body.to_s)
		doc.search('a').each do |link|
		
		hr = link.attributes['href']
		
		if hr and !hr.match(/^(\#|javascript\:)/) 
			begin
				uri = URI.parse(hr)
			
				tssl = false
				if uri.scheme == "https"
					tssl = true
				else
					tssl = false
				end

				if !uri.host or uri.host == nil
					thost = request['rhost']
					tssl = self.targetssl	
				else
					thost = uri.host	
				end

				if !uri.port or uri.port == nil
					tport = request['rport']
				else
					tport = uri.port
				end

				if !uri.path or uri.path == nil
					tpath = "/"
				else
					tpath = uri.path
				end
				
				
				
				newp = Pathname.new(tpath)
				oldp = Pathname.new(request['uri'])
				if !newp.absolute?
					if oldp.to_s[-1,1] == '/'
						newp = oldp+newp
					else
						if !newp.to_s.empty?
							newp = File.join(oldp.dirname,newp)
						end
					end		
				end

				hreq = {
					'rhost'		=> thost,
					'rport'		=> tport,
					'uri'  		=> newp.to_s,
					'method'   	=> 'GET',
					'ctype'		=> 'text/plain',
					'ssl'		=> tssl,
					'query'		=> uri.query,
					'data'		=> nil
					
				}

				insertnewpath(hreq)
					
			rescue URI::InvalidURIError
				#puts "Parse error"
				#puts "Error: #{link[0]}"
			end
		end
		end
	end 
end