Main.py
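
"""Goodreads crawler entry point.

Prompts for the URL of a start book plus the number of books and authors to
collect, crawls outward through each book's similar-books links, and stores
the results in MongoDB.

Example session (values taken from the comments below):
    Please enter the URL of the book: https://www.goodreads.com/book/show/3735293-clean-code
    Please enter the number of books you want to crawl: 3
    Please enter the number of authors you want to crawl: 2
"""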
import time

import Crawler.Spider as Spider
from DataBase import mongoDB as db
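
# Interfaces assumed from how this script uses them (not verified against the
# Crawler and DataBase sources): Spider.Spider(url).crawl(books, authors,
# similar) fills all three dicts in place, and db.insert_dicts(data, flag)
# writes one dict to MongoDB.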
    
    
    
# Example inputs:
#   url: https://www.goodreads.com/book/show/3735293-clean-code
#        https://www.goodreads.com/book/show/35133922-educated
#   book_num: 3, author_num: 2
url = input('Please enter the URL of the book: ')
book_num = int(input('Please enter the number of books you want to crawl: '))
author_num = int(input('Please enter the number of authors you want to crawl: '))
print('Request received, start scraping')
    
book_dict = {}          # book title -> scraped book record
author_dict = {}        # author name -> scraped author record
similar_book_dict = {}  # similar-book title -> page URL, queued for crawling
count = 1               # number of books crawled so far
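
# Crawl the user-supplied start page first. Each crawl() call is expected to
# add that book's similar books to similar_book_dict, which the loop below
# keeps draining until both quotas are met.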
    
print(f'Crawling book #{count}')
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)
    
    
while len(book_dict) < book_num or len(author_dict) < author_num:
    # Work from a snapshot: crawl() may add new entries to similar_book_dict
    # while we iterate, and those are picked up on the next while pass.
    all_related_books = list(similar_book_dict.items())
    crawled_any = False
    for book_name, book_url in all_related_books:
        if len(book_dict) >= book_num and len(author_dict) >= author_num:
            break
        if book_name in book_dict:
            continue  # already crawled via another similar-books list
        count += 1
        print(f'Crawling book #{count}')
        book_spider = Spider.Spider(book_url)
        book_spider.crawl(book_dict, author_dict, similar_book_dict)
        crawled_any = True

        # The network delay from Asia to goodreads.com is already high, so a
        # short sleep is enough to throttle requests.
        time.sleep(0.2)

    if not crawled_any:
        # Nothing new was crawled this pass, so another pass would see the
        # same snapshot; stop rather than loop forever short of the quotas.
        break
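
# insert_dicts(data, flag): the integer flag presumably selects the target
# collection (0 for books, 1 for authors); inferred from these calls, not
# from the DataBase module itself.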
    
db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print('Scraping completed')