The Mechanize library is used for automating interaction with a website. It can follow links and submit forms, and form fields can be populated before submission. A history of visited URLs is maintained and can be queried. A basic example:
require 'rubygems'
require 'mechanize'
require 'logger'

agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
agent.user_agent_alias = 'Mac Safari'
page = agent.get("http://www.google.com/")
search_form = page.forms.name("f").first
search_form.fields.name("q").value = "Hello"
search_results = agent.submit(search_form)
puts search_results.body
VERSION = '0.7.6'
    The version of Mechanize you are using.
AGENT_ALIASES = {
  'Windows IE 6'    => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
  'Windows IE 7'    => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
  'Mac Safari'      => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
  'Mac FireFox'     => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
  'Mac Mozilla'     => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
  'Linux Mozilla'   => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
  'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
  'iPhone'          => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
  'Mechanize'       => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
}
    User Agent aliases
CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
redirect_ok | -> | follow_redirect? |
ca_file              [RW]
cert                 [RW]
conditional_requests [RW]
cookie_jar           [RW]
follow_meta_refresh  [RW]
history              [R]
history_added        [RW]
keep_alive           [RW]
keep_alive_time      [RW]
key                  [RW]
log                  [RW]
open_timeout         [RW]
pass                 [RW]
pluggable_parser     [R]
read_timeout         [RW]
redirect_ok          [RW]
scheme_handlers      [RW]
user_agent           [RW]
verify_callback      [RW]
watch_for_set        [RW]
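Most of these attributes can be set directly on the agent after it is constructed. A minimal sketch; the timeout values below are illustrative, not defaults:

agent = WWW::Mechanize.new
agent.user_agent_alias    = 'Linux Mozilla'
agent.follow_meta_refresh = true
agent.open_timeout        = 10    # illustrative: seconds to wait when opening a connection
agent.read_timeout        = 30    # illustrative: seconds to wait for a response
agent.redirect_ok         = false # do not follow redirects automatically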
# File lib/www/mechanize.rb, line 310
def html_unescape(s)
  return s unless s
  s.gsub(/&(\w+|#[0-9]+);/) { |match|
    number = case match
             when /&(\w+);/
               Hpricot::NamedCharacters[$1]
             when /&#([0-9]+);/
               $1.to_i
             end

    number ? ([number].pack('U') rescue match) : match
  }
end
# File lib/www/mechanize.rb, line 85
def initialize
  # attr_accessors
  @cookie_jar = CookieJar.new
  @log = nil
  @open_timeout = nil
  @read_timeout = nil
  @user_agent = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil
  @ca_file = nil # OpenSSL server certificate file

  # callback for OpenSSL errors while verifying the server certificate
  # chain, can be used for debugging or to ignore errors by always
  # returning _true_
  @verify_callback = nil
  @cert = nil # OpenSSL Certificate
  @key = nil # OpenSSL Private Key
  @pass = nil # OpenSSL Password
  @redirect_ok = true # Should we follow redirects?

  # attr_readers
  @history = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Auth variables
  @user = nil # Auth User
  @password = nil # Auth Password
  @digest = nil # DigestAuth Digest
  @auth_hash = {} # Keep track of urls for sending auth
  @digest_response = nil

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false

  # Connection Cache & Keep alive
  @connection_cache = {}
  @keep_alive_time = 300
  @keep_alive = true

  @scheme_handlers = Hash.new { |h,k|
    h[k] = lambda { |link, page|
      raise UnsupportedSchemeError.new(k)
    }
  }
  @scheme_handlers['http']     = lambda { |link, page| link }
  @scheme_handlers['https']    = @scheme_handlers['http']
  @scheme_handlers['relative'] = @scheme_handlers['http']

  yield self if block_given?
end
# File lib/www/mechanize.rb, line 683
def self.build_query_string(parameters)
  parameters.map { |k,v|
    k &&
      [WEBrick::HTTPUtils.escape_form(k.to_s),
       WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
  }.compact.join('&')
end
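For illustration, the helper joins escaped key/value pairs with '&'; the input pairs below are made up:

WWW::Mechanize.build_query_string([['q', 'ruby mechanize'], ['page', 2]])
# => "q=ruby+mechanize&page=2"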
# File lib/www/mechanize.rb, line 167
def auth(user, password)
  @user = user
  @password = password
end
Sets the user and password to be used for basic authentication.
# File lib/www/mechanize.rb, line 163
def basic_auth(user, password)
  auth(user, password)
end
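For example, with placeholder credentials:

agent.basic_auth('username', 'password')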
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/www/mechanize.rb, line 213
def click(link)
  referer =
    begin
      link.page
    rescue
      nil
    end
  uri = to_absolute_uri(
    link.attributes['href'] || link.attributes['src'] || link.href,
    referer || current_page()
  )
  get(uri, referer)
end
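A short sketch, assuming the page contains a link whose text is 'News' (the URL and link text are made up):

page = agent.get('http://example.com/')
news_link = page.links.find { |l| l.text == 'News' }
news_page = agent.click(news_link)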
Fetches the URL passed in and returns a page. Optional query parameters and a referer may also be supplied.
# File lib/www/mechanize.rb, line 173
def get(url, parameters = [], referer = nil)
  unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
    referer = parameters
    parameters = []
  end

  referer ||= current_page || Page.new(nil, {'content-type'=>'text/html'})

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer = referer.is_a?(String) ?
      Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
      Page.new(referer, {'content-type' => 'text/html'})
  end
  abs_uri = to_absolute_uri(url, referer)

  if parameters.length > 0
    abs_uri.query ||= ''
    abs_uri.query << '&' if abs_uri.query.length > 0
    abs_uri.query << self.class.build_query_string(parameters)
  end

  # fetch the page
  request = fetch_request(abs_uri)
  page = fetch_page(abs_uri, request, referer)
  add_to_history(page)
  yield page if block_given?
  page
end
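Query parameters may be passed as a hash or an array of arrays, and a block may be given to receive the page. A sketch with a made-up URL and parameter:

agent.get('http://example.com/search', [['q', 'ruby']]) do |page|
  puts page.body
end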
Fetches a file and returns its contents (the response body).
# File lib/www/mechanize.rb, line 206
def get_file(url)
  get(url).body
end
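For example, with a made-up URL:

robots_txt = agent.get_file('http://example.com/robots.txt')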
Sets the maximum number of pages stored in the history.

# File lib/www/mechanize.rb, line 143
def max_history=(length); @history.max_size = length; end
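For example, to cap the history at ten pages (the limit here is arbitrary):

agent.max_history = 10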
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/www/mechanize.rb, line 239
def post(url, query={})
  node = Hpricot::Elem.new(Hpricot::STag.new('form'))
  node['method'] = 'POST'
  node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(node)
  query.each { |k,v|
    if v.is_a?(IO)
      form.enctype = 'multipart/form-data'
      ul = Form::FileUpload.new(k.to_s,::File.basename(v.path))
      ul.file_data = v.read
      form.file_uploads << ul
    else
      form.fields << Form::Field.new(k.to_s,v)
    end
  }
  post_form(url, form)
end
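Because IO values are converted into file uploads (and the form's enctype switched to multipart/form-data, as the source above shows), a file can be posted as well. A sketch with a made-up URL, field names, and file:

File.open('report.pdf', 'rb') do |file|
  agent.post('http://example.com/upload',
             'description' => 'monthly report',
             'attachment'  => file)
end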
Sets the proxy address, port, user, and password.
# File lib/www/mechanize.rb, line 147
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
end
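For example, with a made-up proxy host:

agent.set_proxy('proxy.example.com', 8080, 'proxyuser', 'proxypass')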
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com')
agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/www/mechanize.rb, line 264
def submit(form, button=nil)
  form.add_button_to_query(button) if button
  uri = to_absolute_uri(form.action, form.page)
  case form.method.upcase
  when 'POST'
    post_form(uri, form)
  when 'GET'
    uri.query = WWW::Mechanize.build_query_string(form.build_query)
    get(uri)
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
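A fuller sketch that fills in a text field before submitting; the URL and field name are made up:

page = agent.get('http://example.com/')
form = page.forms.first
form.fields.name('q').value = 'Hello'
results = agent.submit(form, form.buttons.first)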
Runs the given block, then restores the page history to what it was before. The agent (self) is passed as a parameter to the block. Returns the value of the block.
# File lib/www/mechanize.rb, line 298
def transact
  history_backup = @history.dup
  begin
    yield self
  ensure
    @history = history_backup
  end
end
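For example, pages fetched inside the block are dropped from the history once the block returns (URLs are made up):

agent.get('http://example.com/')
agent.transact do |a|
  a.get('http://example.com/other')
end
# the history again ends at http://example.com/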
Returns whether or not the given URL has been visited.
# File lib/www/mechanize.rb, line 284
def visited?(url)
  ! visited_page(url).nil?
end
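For example, with made-up URLs:

agent.get('http://example.com/')
agent.visited?('http://example.com/')      # => true
agent.visited?('http://example.com/other') # => false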
# File lib/www/mechanize.rb, line 376
def gen_auth_header(uri, request, auth_header, is_IIS = false)
  @@nonce_count += 1

  user = @digest_user
  password = @digest_password

  auth_header =~ /^(\w+) (.*)/

  params = {}
  $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }

  a_1 = "#{@user}:#{params['realm']}:#{@password}"
  a_2 = "#{request.method}:#{uri.path}"
  request_digest = ''
  request_digest << Digest::MD5.hexdigest(a_1)
  request_digest << ':' << params['nonce']
  request_digest << ':' << ('%08x' % @@nonce_count)
  request_digest << ':' << CNONCE
  request_digest << ':' << params['qop']
  request_digest << ':' << Digest::MD5.hexdigest(a_2)

  header = ''
  header << "Digest username=\"#{@user}\", "
  header << "realm=\"#{params['realm']}\", "
  if is_IIS then
    header << "qop=\"#{params['qop']}\", "
  else
    header << "qop=#{params['qop']}, "
  end
  header << "uri=\"#{uri.path}\", "
  header << "algorithm=MD5, "
  header << "nonce=\"#{params['nonce']}\", "
  header << "nc=#{'%08x' % @@nonce_count}, "
  header << "cnonce=\"#{CNONCE}\", "
  header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""

  return header
end
# File lib/www/mechanize.rb, line 326
def set_headers(uri, request, cur_page)
  if @keep_alive
    request.add_field('Connection', 'keep-alive')
    request.add_field('Keep-Alive', keep_alive_time.to_s)
  else
    request.add_field('Connection', 'close')
  end
  request.add_field('Accept-Encoding', 'gzip,identity')
  request.add_field('Accept-Language', 'en-us,en;q=0.5')
  request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')

  unless @cookie_jar.empty?(uri)
    cookies = @cookie_jar.cookies(uri)
    cookie = cookies.length > 0 ? cookies.join("; ") : nil
    if log
      cookies.each do |c|
        log.debug("using cookie: #{c}")
      end
    end
    request.add_field('Cookie', cookie)
  end

  # Add Referer header to request
  unless cur_page.uri.nil?
    request.add_field('Referer', cur_page.uri.to_s)
  end

  # Add User-Agent header to request
  request.add_field('User-Agent', @user_agent) if @user_agent

  # Add If-Modified-Since if page is in history
  if @conditional_requests
    if( (page = visited_page(uri)) && page.response['Last-Modified'] )
      request.add_field('If-Modified-Since', page.response['Last-Modified'])
    end
  end

  if( @auth_hash[uri.host] )
    case @auth_hash[uri.host]
    when :basic
      request.basic_auth(@user, @password)
    when :digest
      @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
      request.add_field('Authorization', @digest_response) if @digest_response
    end
  end

  request
end
# File lib/www/mechanize.rb, line 691
def add_to_history(page)
  @history.push(page, to_absolute_uri(page.uri))
  history_added.call(page) if history_added
end
uri is an absolute URI
# File lib/www/mechanize.rb, line 482
def fetch_page(uri, request, cur_page=current_page(), request_data=[])
  raise "unsupported scheme: #{uri.scheme}" unless ['http', 'https'].include?(uri.scheme.downcase)

  log.info("#{ request.class }: #{ request.path }") if log

  page = nil

  cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
    :connection => nil,
    :keep_alive_options => {},
  })
  http_obj = cache_obj[:connection]
  if http_obj.nil? || ! http_obj.started?
    http_obj = cache_obj[:connection] =
      Net::HTTP.new( uri.host,
                     uri.port,
                     @proxy_addr,
                     @proxy_port,
                     @proxy_user,
                     @proxy_pass
                   )
    cache_obj[:keep_alive_options] = {}

    # Specify timeouts if given
    http_obj.open_timeout = @open_timeout if @open_timeout
    http_obj.read_timeout = @read_timeout if @read_timeout
  end

  if uri.scheme == 'https' && ! http_obj.started?
    http_obj.use_ssl = true
    http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
    if @ca_file
      http_obj.ca_file = @ca_file
      http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http_obj.verify_callback = @verify_callback if @verify_callback
    end
    if @cert && @key
      http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
      http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
    end
  end

  # If we're keeping connections alive and the last request time is too
  # long ago, stop the connection.  Or, if the max requests left is 1,
  # reset the connection.
  if @keep_alive && http_obj.started?
    opts = cache_obj[:keep_alive_options]
    if((opts[:timeout] &&
        Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
        opts[:max] && opts[:max].to_i == 1)

      log.debug('Finishing stale connection') if log
      http_obj.finish

    end
  end

  http_obj.start unless http_obj.started?

  request = set_headers(uri, request, cur_page)

  # Log specified headers for the request
  if log
    request.each_header do |k, v|
      log.debug("request-header: #{ k } => #{ v }")
    end
  end

  cache_obj[:last_request_time] = Time.now.to_i

  # Send the request
  begin
    response = http_obj.request(request, *request_data) {|response|

      body = StringIO.new
      total = 0
      response.read_body { |part|
        total += part.length
        body.write(part)
        log.debug("Read #{total} bytes") if log
      }
      # Net::HTTP ignores EOFError if Content-length is given, so we emulate it here.
      raise EOFError if response.content_length() && response.content_length() != total
      body.rewind

      response.each_header { |k,v|
        log.debug("response-header: #{ k } => #{ v }")
      } if log

      content_type = nil
      unless response['Content-Type'].nil?
        data = response['Content-Type'].match(/^([^;]*)/)
        content_type = data[1].downcase unless data.nil?
      end

      response_body =
        if encoding = response['Content-Encoding']
          case encoding.downcase
          when 'gzip'
            log.debug('gunzip body') if log
            if response['Content-Length'].to_i > 0 || body.length > 0
              begin
                Zlib::GzipReader.new(body).read
              rescue Zlib::BufError => e
                log.error('Caught a Zlib::BufError') if log
                body.rewind
                body.read(10)
                Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.read)
              end
            else
              ''
            end
          when 'x-gzip'
            body.read
          else
            raise 'Unsupported content encoding'
          end
        else
          body.read
        end

      # Find our pluggable parser
      page = @pluggable_parser.parser(content_type).new(
        uri,
        response,
        response_body,
        response.code
      ) { |parser|
        parser.mech = self if parser.respond_to? :mech=
        if parser.respond_to?(:watch_for_set=) && @watch_for_set
          parser.watch_for_set = @watch_for_set
        end
      }

    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE
    log.error("Rescuing EOF error") if log
    http_obj.finish
    request.body = nil
    http_obj.start
    retry
  end

  # If the server sends back keep alive options, save them
  if keep_alive_info = response['keep-alive']
    keep_alive_info.split(/,\s*/).each do |option|
      k, v = option.split(/=/)
      cache_obj[:keep_alive_options] ||= {}
      cache_obj[:keep_alive_options][k.intern] = v
    end
  end

  (response.get_fields('Set-Cookie')||[]).each do |cookie|
    Cookie::parse(uri, cookie, log) { |c|
      log.debug("saved cookie: #{c}") if log
      @cookie_jar.add(uri, c)
    }
  end

  log.info("status: #{ page.code }") if log

  res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]

  if follow_meta_refresh && page.respond_to?(:meta) &&
     (redirect = page.meta.first)
    return redirect.click
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri)
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri = page.uri
    abs_uri = to_absolute_uri(response['Location'].to_s, page)
    page = fetch_page(abs_uri, fetch_request(abs_uri), page)
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    # Copy the request headers for the second attempt
    req = fetch_request(uri, request.method.downcase.to_sym)
    request.each_header do |k,v|
      req[k] = v
    end
    return fetch_page(uri, req, cur_page, request_data)
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
Creates a new request object based on the scheme and request type.
# File lib/www/mechanize.rb, line 472
def fetch_request(uri, type = :get)
  raise "unsupported scheme: #{uri.scheme}" unless ['http', 'https'].include?(uri.scheme.downcase)
  if type == :get
    Net::HTTP::Get.new(uri.request_uri)
  else
    Net::HTTP::Post.new(uri.request_uri)
  end
end
# File lib/www/mechanize.rb, line 452
def post_form(url, form)
  cur_page = form.page || current_page ||
    Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  abs_url = to_absolute_uri(url, cur_page)
  request = fetch_request(abs_url, :post)
  request.add_field('Content-Type', form.enctype)
  request.add_field('Content-Length', request_data.size.to_s)

  log.debug("query: #{ request_data.inspect }") if log

  # fetch the page
  page = fetch_page(abs_url, request, cur_page, [request_data])
  add_to_history(page)
  page
end
# File lib/www/mechanize.rb, line 417
def to_absolute_uri(url, cur_page=current_page())
  unless url.is_a? URI
    url = url.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/) { |match|
      sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
    }

    url = URI.parse(
            Mechanize.html_unescape(
              url.split(/%[0-9A-Fa-f]{2}|#/).zip(
                url.scan(/%[0-9A-Fa-f]{2}|#/)
              ).map { |x,y|
                "#{URI.escape(x)}#{y}"
              }.join('')
            )
          )
  end

  url = @scheme_handlers[url.relative? ? 'relative' : url.scheme.downcase].call(url, cur_page)
  url.path = '/' if url.path.length == 0

  # construct an absolute uri
  if url.relative?
    raise 'no history. please specify an absolute URL' unless cur_page.uri
    base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
    url = ((base && base.uri && base.uri.absolute?) ?
            base.uri :
            cur_page.uri) + url
    url = cur_page.uri + url
    # Strip initial "/.." bits from the path
    url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  return url
end