import time

import Crawler.Spider as Spider
import RegularExpressionParser.Parser as Parser
from DataBase import mongoDB as db


def scrape_api(url_api, book_num_api, author_num_api):
    print('Request received, start scraping')
    book_dict_api = {}
    author_dict_api = {}
    similar_book_dict_api = {}
    count_api = 1
    print('Crawling book #' + str(count_api))
    start_page_api = Spider.Spider(url_api)
    start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
    # Keep following "similar book" links until either quota is reached.
    while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api):
        all_related_books_api = list(similar_book_dict_api.items())
        for book_name_api, book_url_api in all_related_books_api:
            if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api):
                break
            if book_name_api in book_dict_api:
                continue  # already crawled; skip duplicates
            count_api += 1
            print('Crawling book #' + str(count_api))
            book_spider_api = Spider.Spider(book_url_api)
            book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
            # The network delay from Asia to goodreads.com is already high,
            # so only a short sleep is needed between requests.
            time.sleep(0.2)
    db.insert_dicts(book_dict_api, 0)   # mode 0: insert into the book collection
    db.insert_dicts(author_dict_api, 1)  # mode 1: insert into the author collection
    print('Scraping completed')


if __name__ == '__main__':
    print('Please enter the URL of the book')
    url = input()
    print('Please enter the number of books you want to crawl')
    book_num = input()
    print('Please enter the number of authors you want to crawl')
    author_num = input()
    scrape_api(url, book_num, author_num)
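
# For reference, the data shapes this script assumes, inferred from how the
# dicts are used above (the exact record fields Spider.crawl stores are
# defined in Crawler.Spider, not in this file):
#
#   book_dict:         {book_name: book_record}
#   author_dict:       {author_name: author_record}
#   similar_book_dict: {book_name: book_url}
#
# scrape_api can also be invoked programmatically instead of via the prompts
# above; a minimal sketch (the Goodreads URL below is illustrative only):
#
#   scrape_api('https://www.goodreads.com/book/show/3735293-clean-code', 5, 3)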