Scrape.py

import Crawler.Spider as Spider
import time
from DataBase import mongoDB as db
    
    
def scrape_api(url_api, book_num_api, author_num_api):
    """Crawl books starting from url_api until at least book_num_api books
    and author_num_api authors have been collected, then write the results
    to MongoDB."""
    print('Request received, starting scrape')
    book_dict_api = {}
    author_dict_api = {}
    similar_book_dict_api = {}
    count_api = 1

    print(f"Crawling book #{count_api}")
    start_page_api = Spider.Spider(url_api)
    start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
    
    while len(book_dict_api) < int(book_num_api) or len(author_dict_api) < int(author_num_api):
        # Snapshot the similar-books mapping, since crawl() keeps adding to it.
        all_related_books_api = list(similar_book_dict_api.items())
        for book_name_api, book_url_api in all_related_books_api:
            if len(book_dict_api) >= int(book_num_api) and len(author_dict_api) >= int(author_num_api):
                break
            # Only crawl books that have not been collected yet.
            if book_name_api not in book_dict_api:
                count_api += 1
                print(f"Crawling book #{count_api}")
                book_spider_api = Spider.Spider(book_url_api)
                book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)

                # Since the network delay from Asia to goodreads.com is already
                # very high, only a short sleep is needed between requests.
                time.sleep(0.2)
    
    # The second argument to insert_dicts presumably selects the target
    # collection: 0 for books, 1 for authors.
    db.insert_dicts(book_dict_api, 0)
    db.insert_dicts(author_dict_api, 1)
    print("Scraping completed")