"""Download the comic images from engrish.com and build a local HTML gallery.

The script walks the category index pages linked from the front page,
collects every comic detail page, downloads the image found on each one
together with its caption, and finally writes one small HTML page per image
with Previous/Next links between them.

Every URL that is extracted along the way is also logged to ``links.txt``.
"""
import contextlib
import os
import re
import socket  # kept from the original import list; not used directly here
import urllib

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

try:  # legacy Python 2 module that the script was written against
    from sets import Set
except ImportError:
    Set = set


engrish_website_str = "http://www.engrish.com"

# NOTE(review): in these href patterns "." and "?" are unescaped, so "?" makes
# the preceding "p" optional instead of matching a literal question mark.  The
# patterns still match the intended links and are kept unchanged.
category_href_re = re.compile('"category_index.php?.*"')
detail_href_re = re.compile('"detail.php?.*"')
img_hre_re = re.compile('"image/engrish/.*"')
# NOTE(review): the tail of this pattern was lost (it had degraded to a raw
# line break).  A trailing "<" is the loosest reconstruction that still
# requires markup after the caption on the same line; confirm against the site.
comments_re = re.compile('"Times New Roman, Times, serif">.*<')
quote_re = re.compile('"')
angular_bracket_re = re.compile('">')
angular_closing_bracket_re = re.compile('<')
category_next_href_re = re.compile('"/category_index.php?.*"')
next_re = re.compile('.*[nN]ext.*')

# Fragments of the generated gallery page.  WriteHtmlFile() interleaves them
# with, in order: image file name, caption, previous page, next page.
# NOTE(review): the original markup of these fragments was lost (pieces 2, 4
# and 6 were missing altogether); only the "tabasco" id and the
# "Previous"/"Next" link texts survived.  The tags below are a minimal
# reconstruction that fits how WriteHtmlFile() uses the pieces.
html_page_piece1 = '<html>\n<body id="tabasco">\n<img src="'
html_page_piece2 = '">\n<br>\n'
html_page_piece3 = '\n<br>\n<a href="'
html_page_piece4 = '">Previous</a>' + "&nbsp;" * 9
html_page_piece5 = '<a href="'
html_page_piece6 = '">Next</a>\n</body>\n</html>\n'

# Parallel lists filled by DownloadPicAndComment(): entry i of one list
# belongs to entry i of the other.
global_download_file_names = []
global_comments_strings = []

# Log file receiving every extracted URL.  It is None until Main() opens
# "links.txt", so the helpers can also be used without the log.
file_handle = None


def _FetchPage(url):
    """Return the body of *url* as text, always closing the connection."""
    with contextlib.closing(urlopen(url)) as handle:
        data = handle.read()
    if not isinstance(data, str):
        # Python 3 returns bytes; the regular expressions above work on text.
        data = data.decode("utf-8", "replace")
    return data


def WriteHtmlFile(html_file_name, img_file_name, comments_str, prev_html,
                  next_html):
    """Write one gallery page.

    html_file_name -- path of the page to create (overwritten if present)
    img_file_name  -- image shown on the page
    comments_str   -- caption placed after the image
    prev_html      -- link target of "Previous" ("" on the first page)
    next_html      -- link target of "Next" ("" on the last page)
    """
    page = "".join([
        html_page_piece1, img_file_name,
        html_page_piece2, comments_str,
        html_page_piece3, prev_html,
        html_page_piece4,
        html_page_piece5, next_html,
        html_page_piece6,
    ])
    with open(html_file_name, "w") as comments_file_handle:
        comments_file_handle.write(page)


def CreateAllHtmlFiles(download_file_names_list, comment_strings_list):
    """Write "<image>.html" for every image, linked to its neighbours.

    The two lists must have the same length; otherwise a message is printed
    and nothing is written.  Always returns None.
    """
    if len(download_file_names_list) != len(comment_strings_list):
        print("Number of files is not the same as the comments, exiting")
        return None

    print("Starting to create the Html Files")
    last_index = len(download_file_names_list) - 1
    pairs = zip(download_file_names_list, comment_strings_list)
    for i, (img_file_name, comments_str) in enumerate(pairs):
        # The first page has no predecessor and the last one no successor.
        # Checking both ends independently also covers a one-image gallery.
        prev_html = ""
        next_html = ""
        if i > 0:
            prev_html = download_file_names_list[i - 1] + ".html"
        if i < last_index:
            next_html = download_file_names_list[i + 1] + ".html"
        WriteHtmlFile(str(img_file_name + ".html"), img_file_name,
                      comments_str, prev_html, next_html)
    return None


def ExtractUrls(buffer, href_re, split_quote_re):
    """Return the absolute URLs found in *buffer*, at most one per line.

    buffer         -- page text
    href_re        -- compiled pattern matching a quoted href value
    split_quote_re -- compiled pattern used to cut the match at its quotes

    Each URL is also appended to the links log when that file is open.
    """
    url_list = []
    for current_line in buffer.splitlines():
        individual_url = GetCompleteUrl(current_line, href_re, split_quote_re)
        if individual_url is None:
            continue
        url_list.append(individual_url)
        if file_handle is not None:
            file_handle.write(individual_url + "\n")
    return url_list


def GetCompleteUrl(current_line, href_re, split_quote_re):
    """Return the absolute URL of the first *href_re* match, or None.

    The match is split with *split_quote_re* and the text between the first
    two quotes is appended to the site root.
    """
    match = href_re.search(current_line)
    if match is None:
        return None
    pieces = split_quote_re.split(match.group(0))
    if len(pieces) < 2:
        return None
    # Some hrefs already start with "/"; strip it so the result does not
    # contain "//" right after the host name.
    return engrish_website_str + "/" + pieces[1].lstrip("/")


def GetNextUrl(current_line):
    """Print *current_line* if it mentions "next"/"Next"; returns None."""
    match = next_re.search(current_line)
    if match is not None:
        print("next url" + match.group(0))


def ExtractComments(buffer):
    """Return the caption of a comic page, or "" when none is found.

    The caption is the text between the '">' that closes the font-face
    attribute matched by comments_re and the next "<" on the same line.
    Only the first matching line is used.
    """
    for current_line in buffer.splitlines():
        match = comments_re.search(current_line)
        if match is None:
            continue
        after_attribute = angular_bracket_re.split(match.group(0))
        if len(after_attribute) < 2:
            continue
        return angular_closing_bracket_re.split(after_attribute[1])[0]
    return ""


def DownloadPicAndComment(comic_url):
    """Download the image(s) of one comic page and remember its caption.

    Images are saved in the current directory under their remote base name.
    The last image name and the caption are appended to the two module-level
    lists; pages without an image are skipped so the gallery does not get an
    entry with an empty file name.
    """
    page_text = _FetchPage(comic_url)

    download_file_name = ""
    for pic_url in ExtractUrls(page_text, img_hre_re, quote_re):
        download_file_name = os.path.split(pic_url)[1]
        urlretrieve(pic_url, download_file_name)

    if not download_file_name:
        print("No image found on " + comic_url + ", skipping")
        return

    global_download_file_names.append(download_file_name)
    global_comments_strings.append(ExtractComments(page_text))


def RecursivelyCollectComicPageUrls(comic_page_url_set, level2_set,
                                    current_url):
    """Collect comic page URLs from *current_url* and the pages it links to.

    comic_page_url_set -- set extended in place with the detail page URLs
    level2_set         -- set of index pages already visited (updated here)
    current_url        -- index page to process
    """
    if current_url in level2_set:
        print("already in the set")
        return
    # Mark the page before following its links: index pages link to each
    # other, so this is what makes the recursion terminate.
    level2_set.add(current_url)

    page_text = _FetchPage(current_url)
    for url in ExtractUrls(page_text, detail_href_re, quote_re):
        comic_page_url_set.add(url)
    for probable_url in ExtractUrls(page_text, category_next_href_re,
                                    quote_re):
        RecursivelyCollectComicPageUrls(comic_page_url_set, level2_set,
                                        probable_url)


def CollectComicPageUrls(buffer):
    """Return the set of comic page URLs reachable from a category page.

    buffer -- text of the first page of the category
    """
    complete_comic_page_set = Set()
    complete_level2_set = Set()

    # First collect the urls in this first level page ...
    for url in ExtractUrls(buffer, detail_href_re, quote_re):
        complete_comic_page_set.add(url)
    # ... then follow the links to the other index pages.
    for probable_url in ExtractUrls(buffer, category_next_href_re, quote_re):
        RecursivelyCollectComicPageUrls(complete_comic_page_set,
                                        complete_level2_set, probable_url)
    return complete_comic_page_set


def Main():
    """Run the whole download: categories, comic pages, images, gallery."""
    global file_handle

    front_page = _FetchPage(engrish_website_str)
    print("Completed downloading webpage")

    file_handle = open("links.txt", "w")
    try:
        # All the categories listed on the front page.
        engrish_categories = ExtractUrls(front_page, category_href_re,
                                         quote_re)
        print("Num categories %d" % len(engrish_categories))

        # All the comic pages, regardless of category.
        comic_page_url_set = Set()
        for category_url in engrish_categories:
            print("Opening url: " + category_url)
            comic_page_url_set |= CollectComicPageUrls(
                _FetchPage(category_url))
            print("Num comic page urls %d" % len(comic_page_url_set))

        comic_page_url_list = list(comic_page_url_set)
        num_comics = len(comic_page_url_list)
        print("Num comics %d" % num_comics)

        for counter, url in enumerate(comic_page_url_list, 1):
            print("Downloading comic [%d/%d] : %s"
                  % (counter, num_comics, url))
            DownloadPicAndComment(url)

        print("Completed downloading comics. Writing to html files is left")
        CreateAllHtmlFiles(global_download_file_names,
                           global_comments_strings)
    finally:
        file_handle.close()
        file_handle = None


if __name__ == "__main__":
    Main()