import urllib
import re
import socket
import os
from sets import Set
# Root of the site being scraped; hrefs extracted from pages are joined onto it.
engrish_website_str = "http://www.engrish.com"

# Each href pattern matches ONE double-quoted attribute value, quotes included.
# The literal "." is escaped and "[^\"]*" stops at the closing quote, so a match
# never runs on into later attributes on the same line.
category_href_re = re.compile(r'"category_index\.php[^"]*"')
detail_href_re = re.compile(r'"detail\.php[^"]*"')
img_hre_re = re.compile(r'"image/engrish/[^"]*"')

# NOTE(review): this literal was split across two lines (a syntax error) and
# whatever followed ".*" could not be recovered; confirm the intended tail.
# ExtractComments only keeps the text between '">' and the next '<'.
comments_re = re.compile(r'"Times New Roman, Times, serif">.*')

# Delimiters used to cut the interesting part out of a regex match.
quote_re = re.compile('"')
angular_bracket_re = re.compile('">')
angular_closing_bracket_re = re.compile('<')

# Same as category_href_re, but for hrefs written with a leading slash.
category_next_href_re = re.compile(r'"/category_index\.php[^"]*"')
next_re = re.compile('.*[nN]ext.*')
# Fixed fragments of the generated HTML page. WriteHtmlFile emits them in the
# order: piece1, image file name, piece2, comment text, piece3, previous page,
# piece4, piece5, next page, piece6.
#
# NOTE(review): the original markup was not recoverable (piece1 was an
# unterminated literal and pieces 2, 4 and 6 were missing altogether). The
# fragments below are a minimal reconstruction that fits the write order and
# the surviving "Previous"/"Next" link labels -- confirm the intended layout.
html_page_piece1 = '<html>\n<body>\n<img src="'
html_page_piece2 = '">\n<p>'
html_page_piece3 = '</p>\n<a href="'
html_page_piece4 = '">Previous</a>\n'
html_page_piece5 = '<a href="'
html_page_piece6 = '">Next</a>\n</body>\n</html>\n'

# Parallel lists: DownloadPicAndComment appends one entry to each per comic,
# and CreateAllHtmlFiles consumes them pairwise by index.
global_download_file_names = []
global_comments_strings = []
def WriteHtmlFile (html_file_name, img_file_name, comments_str, prev_html, next_html):
    """Write one HTML page for a downloaded picture.

    The page is the module-level html_page_piece1..6 fragments interleaved
    with the variable parts, in this order: piece1, img_file_name, piece2,
    comments_str, piece3, prev_html, piece4, piece5, next_html, piece6.

    html_file_name -- path of the file to create (overwritten if present)
    img_file_name  -- text inserted after piece1
    comments_str   -- text inserted after piece2
    prev_html      -- text inserted after piece3
    next_html      -- text inserted after piece5
    """
    # Assemble the whole page first so a missing fragment fails before the
    # target file is created or truncated.
    page = "".join([
        html_page_piece1, img_file_name,
        html_page_piece2, comments_str,
        html_page_piece3, prev_html,
        html_page_piece4,
        html_page_piece5, next_html,
        html_page_piece6,
    ])
    # "with" closes the file even if the write raises.
    with open(html_file_name, "w") as comments_file_handle:
        comments_file_handle.write(page)
def CreateAllHtmlFiles(download_file_names_list, comment_strings_list):
    """Write one HTML page per downloaded picture, chained by prev/next links.

    The page for entry i is named "<file name>.html". Its previous/next
    targets are the page names of entries i-1 and i+1, or "" at either end of
    the list (so a single-entry list gets two empty targets).

    Returns None in all cases. If the two lists differ in length a message is
    printed and nothing is written.
    """
    total = len(download_file_names_list)
    if total != len(comment_strings_list):
        print("Number of files is not the same as the comments, exiting")
        return None
    print("Starting to create the Html Files")
    for i, img_file_name in enumerate(download_file_names_list):
        # Each end is checked independently so a one-element list does not
        # index past the end.
        prev_html = ""
        next_html = ""
        if i > 0:
            prev_html = download_file_names_list[i - 1] + ".html"
        if i < total - 1:
            next_html = download_file_names_list[i + 1] + ".html"
        WriteHtmlFile(str(img_file_name + ".html"), img_file_name,
                      comment_strings_list[i], prev_html, next_html)
def ExtractUrls ( buffer, href_re, split_quote_re):
    """Return the absolute URLs found in buffer, at most one per line.

    buffer         -- page text; it is processed line by line
    href_re        -- compiled pattern selecting the quoted href on a line
    split_quote_re -- compiled pattern used to split the match at its quotes

    Side effect: every URL found is also written, one per line, to the
    module-level file_handle, which must already be open for writing.
    """
    url_list = []
    for current_line in buffer.splitlines(False):
        # Pass the caller's splitter through instead of the global quote_re.
        individual_url = GetCompleteUrl(current_line, href_re, split_quote_re)
        if individual_url is not None:
            url_list.append(individual_url)
            file_handle.write(individual_url + "\n")
    return url_list
def GetCompleteUrl ( current_line, href_re, split_quote_re):
    """Return the absolute URL for the first href_re match on a line.

    The first match is split with split_quote_re and the second piece (the
    text between the first two delimiters) is appended to engrish_website_str.

    Returns None when the line has no match, or when the match cannot be
    split into at least two pieces.
    """
    matches = href_re.findall(current_line)
    if not matches:
        return None
    # Use the splitter the caller supplied rather than the global quote_re.
    pieces = split_quote_re.split(matches[0])
    if len(pieces) < 2:
        # No delimiter inside the match: nothing usable to extract.
        return None
    # Hrefs may or may not start with "/"; avoid producing "//" in the URL.
    return engrish_website_str + "/" + pieces[1].lstrip("/")
def GetNextUrl ( current_line ):
    """Print the first next_re match found on current_line, if any.

    Nothing is returned; the match is only reported on standard output.
    """
    # NOTE(review): this used to reference valid_next_re, which is not defined
    # in this file; next_re is the only "next" pattern available -- confirm.
    matches = next_re.findall(current_line)
    if matches:
        print("next url" + matches[0])
def ExtractComments ( buffer ):
    """Return the comment text from the first line matching comments_re.

    The text returned is what lies between the first '">' of the match and
    the next '<'. An empty string is returned when no line matches.
    """
    for line in buffer.splitlines(False):
        hits = comments_re.findall(line)
        if not hits:
            continue
        # Drop everything up to and including the first '">' ...
        after_open_tag = angular_bracket_re.split(hits[0])[1]
        # ... and everything from the next '<' onwards.
        return angular_closing_bracket_re.split(after_open_tag)[0]
    return ""
def DownloadPicAndComment( comic_url):
    """Download the picture(s) on a comic page and record its comment.

    Every image URL matched by img_hre_re is saved to the current directory
    under its base name. The last saved file name and the page's comment text
    are appended to global_download_file_names / global_comments_strings.
    Pages without any matching image are skipped so that no page with an
    empty file name is generated later.
    """
    handle = urllib.urlopen(comic_url)
    try:
        buffer = handle.read()
    finally:
        # Close the connection even if the read fails.
        handle.close()

    download_file_name = ""
    for pic_url in ExtractUrls(buffer, img_hre_re, quote_re):
        download_file_name = os.path.split(pic_url)[1]
        urllib.urlretrieve(pic_url, download_file_name)

    if not download_file_name:
        print("No picture found, skipping: " + comic_url)
        return

    # The two lists are appended together to keep them index-aligned.
    global_download_file_names.append(download_file_name)
    global_comments_strings.append(ExtractComments(buffer))
def RecursivelyCollectComicPageUrls(comic_page_url_set, level2_set, current_url):
    """Collect comic page URLs from current_url and every page it links on to.

    comic_page_url_set -- set updated in place with each detail-page URL found
    level2_set         -- set of listing pages already visited; updated in
                          place so that no page is fetched twice
    current_url        -- listing page to start from

    Follow-up pages are those matched by category_next_href_re. A page that
    is already in level2_set is reported and not fetched again.
    """
    # An explicit work list replaces the recursion, so a long chain of
    # follow-up pages cannot exhaust the interpreter's recursion limit.
    pending = [current_url]
    while pending:
        page_url = pending.pop()
        if page_url in level2_set:
            print("already in the set")
            continue
        # Mark the page as visited BEFORE following its links; without this
        # pages that link to each other would be fetched forever.
        level2_set.add(page_url)

        handle = urllib.urlopen(page_url)
        try:
            buffer = handle.read()
        finally:
            handle.close()

        for url in ExtractUrls(buffer, detail_href_re, quote_re):
            comic_page_url_set.add(url)
        pending.extend(ExtractUrls(buffer, category_next_href_re, quote_re))
def CollectComicPageUrls(buffer):
    """Return the Set of comic page URLs reachable from a category page.

    buffer is the text of the first-level category page. Its own detail links
    are gathered first, then every follow-up listing page it links to is
    walked with RecursivelyCollectComicPageUrls.
    """
    visited_listing_pages = Set()
    comic_pages = Set()

    # Comic links that appear directly on this first-level page.
    for detail_url in ExtractUrls(buffer, detail_href_re, quote_re):
        comic_pages.add(detail_url)

    # Comic links on the follow-up listing pages.
    for listing_url in ExtractUrls(buffer, category_next_href_re, quote_re):
        RecursivelyCollectComicPageUrls(comic_pages, visited_listing_pages, listing_url)

    return comic_pages
#Main()
# Script body: fetch the site's front page, discover the category pages,
# collect every comic page URL, download each comic and finally generate the
# HTML pages that link the downloaded pictures together.
handle = urllib.urlopen(engrish_website_str)
try:
    buffer = handle.read()
finally:
    handle.close()
print("Completed downloading webpage")

# ExtractUrls logs every URL it finds to this module-level handle, so it has
# to be open before the first extraction and stay open until the end.
file_handle = open("links.txt", "w")
try:
    # All category pages linked from the front page.
    engrish_categories = ExtractUrls(buffer, category_href_re, quote_re)
    print("Num categories  %d" % len(engrish_categories))

    # All comic pages, regardless of category. To try the script on a single
    # category only, iterate over engrish_categories[:1] instead.
    comic_page_url_set = Set()
    for category_url in engrish_categories:
        print("Opening url: " + category_url)
        handle = urllib.urlopen(category_url)
        try:
            buffer = handle.read()
        finally:
            handle.close()
        comic_page_url_set.union_update(CollectComicPageUrls(buffer))
    print("Num comic page urls  %d" % len(comic_page_url_set))

    comic_page_url_list = list(comic_page_url_set)
    num_comics = len(comic_page_url_list)
    print("Num comics  %d" % num_comics)

    for counter, url in enumerate(comic_page_url_list, 1):
        # %-formatting converts the integers; "+" on str and int raises.
        print("Downloading comic [%d/%d] : %s" % (counter, num_comics, url))
        DownloadPicAndComment(url)

    print("Completed downloading comics. Writing to html files is left")
    CreateAllHtmlFiles(global_download_file_names, global_comments_strings)
finally:
    # Close the link log even when a download fails part-way through.
    file_handle.close()