import time

import Crawler.Spider as Spider
from DataBase import mongoDB as db

print('Please enter the URL of the book')
url = input()
# Example URLs:
# https://www.goodreads.com/book/show/3735293-clean-code
# https://www.goodreads.com/book/show/35133922-educated

print('Please enter the number of books you want to crawl')
book_num = int(input())  # e.g. 3

print('Please enter the number of authors you want to crawl')
author_num = int(input())  # e.g. 2

print('Request received, start scraping')

book_dict = {}
author_dict = {}
similar_book_dict = {}
count = 1

# Crawl the starting book; the spider fills the three dicts in place.
print(f'Crawling book #{count}')
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)

# Keep crawling books discovered through the "similar books" lists until
# both the book quota and the author quota are met.
while len(book_dict) < book_num or len(author_dict) < author_num:
    all_related_books = list(similar_book_dict.items())
    for book_name, book_url in all_related_books:
        if len(book_dict) >= book_num and len(author_dict) >= author_num:
            break
        if book_name in book_dict:
            continue  # skip books that were already crawled
        count += 1
        print(f'Crawling book #{count}')
        book_spider = Spider.Spider(book_url)
        book_spider.crawl(book_dict, author_dict, similar_book_dict)
        # Since the network delay from Asia to goodreads.com is already
        # quite high, only a short sleep is needed between requests.
        time.sleep(0.2)

db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print('Scraping completed')