# (Stray "Newer" / "Older" page-navigation text from the source listing; not part of the program.)
import time
from DataBase import mongoDB as db
# Interactive entry point: ask for a seed Goodreads book URL and target counts,
# crawl outward through the "similar books" links until both targets are met,
# then hand the collected books and authors to the database layer.
# NOTE(review): `Spider` is used below but is not imported in the visible part
# of this file -- presumably an `import Spider` belongs with the imports; confirm.

print('please enter URL of the book')
url = input()
# Example seed URLs:
#   https://www.goodreads.com/book/show/3735293-clean-code
#   https://www.goodreads.com/book/show/35133922-educated
print('please enter the number of books you want to crawl')
book_num = input()
# e.g. 3
print('please enter the number of authors you want to crawl')
author_num = input()
# e.g. 2

# Convert the targets once, up front: a non-numeric answer now raises
# ValueError before any page is fetched, and the loop below no longer
# re-parses the same strings on every check.
target_books = int(book_num)
target_authors = int(author_num)

print('Request received, start scraping')

# Accumulators passed to every crawl() call.
book_dict = {}
author_dict = {}
similar_book_dict = {}

count = 1
print("Crawling book #" + str(count))
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)

while len(book_dict) < target_books or len(author_dict) < target_authors:
    crawled_this_pass = False

    # Iterate over a snapshot: similar_book_dict is handed to crawl() inside
    # the loop (presumably so it can add newly discovered books), and a dict
    # must not change size while it is being iterated.
    for book_name, book_url in list(similar_book_dict.items()):
        if len(book_dict) >= target_books and len(author_dict) >= target_authors:
            break
        if book_name in book_dict:
            # Already collected; nothing to do for this entry.
            continue

        count += 1
        print("Crawling book #" + str(count))
        book_spider = Spider.Spider(book_url)
        book_spider.crawl(book_dict, author_dict, similar_book_dict)
        crawled_this_pass = True

        # since the network delay from asia to goodreads.com is already very
        # high, a small amount of time of sleep is used here.
        time.sleep(0.2)

    if not crawled_this_pass:
        # Every known similar book is already in book_dict, so another pass
        # would find nothing new either. Stop instead of spinning forever
        # when the requested targets cannot be reached.
        print("No more new books to crawl, stopping early")
        break

# NOTE(review): the second argument presumably selects the destination
# collection (0 for books, 1 for authors) -- confirm against DataBase.mongoDB.
db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print("Scraping completed")