#!/usr/bin/env python
# By: Raymond Myers
# License: GPL 3
"""Print one randomly chosen item from an eBay search-results page.

The script downloads the page at START_URL, scrapes a (url, name, price)
tuple for every item table found in it, picks one at random and prints its
three fields, one per line.
"""

# Import everything we need from the core libraries:
import random
import re
import sys
import urllib

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3 ships the same class in html.parser; alias the module so the
    # rest of the file can keep saying HTMLParser.HTMLParser.
    import html.parser as HTMLParser

##################################################
# CONFIGURATION CONSTANTS:

# This url represents a search for all items between 0 and 5 USD that have both
# free shipping, and a "buy it now" option, and that have more than an hour left
# to go... (So that we don't get in many races...)
START_URL = (
    "http://shop.ebay.com/i.html?_nkw=a+b+c+d+e+f+g+h+i+j+k+l+m+n+o+p"
    "+q+r+s+t+u+v+w+x+y+z&_in_kw=2&_ex_kw=&_sacat=See-All-Categories&"
    "_okw=a+b+c+d+e+f+g+h+i+j+k+l+m+n+o+p+q+r+s+t+u+v+w+x+y+z&_oexkw="
    "&_mPrRngCbx=1&_udlo=0&_udhi=5&LH_BIN=1&LH_IncludeSIF=1&_LH_Time="
    "1&_ftrt=902&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&LH_FS=1"
    "&_sadis=200&_fpos=Zip+code&_fsct=&LH_SALE_CURRENCY=0&_fss=1&_sas"
    "lop=1&_sasl=&fsradio=%26LH_TopRatedSellers%3D1&_sop=1&_dmd=1&_ip"
    "g=200"
)

# These are all of the possible user agents... just to obfuscate the fact that
# this is, indeed, a script. Long ones are broken up, cause they're long:
AGENTS = [
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) "
    "Gecko/20070725 Firefox/2.0.0.6",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; "
    ".NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322)",
]

# Patterns that match a whole <script>...</script> or <style>...</style>
# element.  DOTALL lets ".*?" run across line breaks; the match is non-greedy
# so each element is removed on its own.
_SCRIPT_REGEX = re.compile(r"<script\b.*?</script\s*>",
                           re.IGNORECASE | re.DOTALL)
_STYLE_REGEX = re.compile(r"<style\b.*?</style\s*>",
                          re.IGNORECASE | re.DOTALL)

##################################################
# CLASS DEFINITIONS:

# Override the urllib user-agent with a random one:
if hasattr(urllib, "FancyURLopener"):
    # Python 2: urllib.urlopen() goes through urllib._urlopener, so
    # installing our subclass there changes the User-Agent it sends.
    class WebNabber(urllib.FancyURLopener):
        """URL opener that identifies itself with a random entry of AGENTS."""
        version = random.choice(AGENTS)

    urllib._urlopener = WebNabber()
    _open_url = urllib.urlopen
else:
    # Python 3: the opener classes live in urllib.request, so the
    # User-Agent header is set explicitly on each request instead.
    import urllib.request

    class WebNabber(object):
        """URL opener that identifies itself with a random entry of AGENTS."""
        version = random.choice(AGENTS)

        def open(self, url):
            """Open *url* with the chosen User-Agent; return the response."""
            request = urllib.request.Request(
                url, headers={"User-Agent": self.version})
            return urllib.request.urlopen(request)

    _open_url = WebNabber().open


# Override the HTML parser:
class EbayParser(HTMLParser.HTMLParser):
    """Collect (url, name, price) tuples from a search-results page.

    An item starts at a <table class="lview nol"> tag.  Inside it, the
    <a class="v4lnk"> link supplies the url (its href) and the name (its
    text), and the <div class="g-b"> supplies the price text.  Finished items
    are appended to self.output when the table's end tag is seen.
    """

    def __init__(self):
        self.output = []
        self.in_entry, self.in_name, self.in_price = False, False, False
        self.eurl, self.ename, self.eprice = "", "", ""
        # Explicit base-class call rather than super(): on Python 2 the
        # base parser is an old-style class.
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Track entry into an item table, its name link and its price div."""
        if not self.in_entry:
            # Is this a table entry for an item in the search results?
            if tag == "table" and ('class', 'lview nol') in attrs:
                self.in_entry = True
                self.eurl, self.ename, self.eprice = "", "", ""
            return

        # Is this the link with the name?
        if tag == "a" and ('class', 'v4lnk') in attrs:
            # Yes, so note that data should be the name:
            self.in_name = True
            # Now, we get the link url (kept unchanged if there is no href):
            self.eurl = dict(attrs).get("href", self.eurl)
        # Is this the price div?
        elif tag == "div" and ('class', 'g-b') in attrs:
            self.in_price = True

    def handle_endtag(self, tag):
        """Leave the name link / price div, or finish the current item."""
        if not self.in_entry:
            return

        # Is this the end of the name link?
        if tag == "a" and self.in_name:
            self.in_name = False
        # Is this the end of the price div?
        elif tag == "div" and self.in_price:
            self.in_price = False
        # Are we done with our table?
        elif tag == "table":
            # NOTE(review): any </table> ends the entry, so a table nested
            # inside an item would cut the item short -- confirm against the
            # real page structure.
            # Yes, so append the current parse results to the output list:
            self.output.append((self.eurl, self.ename, self.eprice))
            self.in_entry = False

    def handle_data(self, data):
        """Accumulate text into the name or the price, whichever is open."""
        if self.in_name:
            self.ename += data
        elif self.in_price:
            self.eprice += data


##################################################
# FUNCTION DEFINITIONS:

# This will return the HTML code that makes up a page:
def get_html(url):
    """Download *url* and return the page body as a string.

    Raises:
        Exception: with the message "Failed to get page." when the page
            cannot be opened or read.
    """
    # "except Exception" (not a bare except) so that Ctrl-C and sys.exit()
    # are not mistaken for a download failure.
    try:
        conn = _open_url(url)
        try:
            html = conn.read()
        finally:
            # Close the connection even when read() fails.
            conn.close()
    except Exception:
        raise Exception("Failed to get page.")

    if not isinstance(html, str):
        # Python 3 hands back bytes.  Undecodable bytes are replaced instead
        # of aborting the whole run.
        html = html.decode("utf-8", "replace")
    return html


# This will take the HTML data from ebay, and return a nice list:
def parse_ebay(html):
    """Return a list of (url, name, price) tuples scraped from *html*.

    <script> and <style> elements are removed before parsing so that their
    contents can neither confuse the parser nor leak into names and prices.
    The list is empty when the page contains no item tables.
    """
    # Filter out irregular stuff, like Javascript and CSS:
    html = _SCRIPT_REGEX.sub("", html)
    html = _STYLE_REGEX.sub("", html)

    # Parse the HTML document
    parser = EbayParser()
    parser.feed(html)
    parser.close()

    # Return the cool stuff:
    return parser.output


def main():
    """Fetch the search page and print one randomly selected item."""
    # Grab the page:
    html = get_html(START_URL)

    # Parse the search results:
    ebay = parse_ebay(html)
    if not ebay:
        # random.choice() would raise IndexError on an empty list.
        sys.exit("No items found in the search results.")

    # Make a random selection, and print it out:
    buyme = random.choice(ebay)
    for field in buyme:
        print(field)


if __name__ == "__main__":
    main()