# Spider.py — Goodreads crawler: scrapes one book page, its "similar books"
# list, and the author's profile into caller-supplied dictionaries.
  • import requests
    from bs4 import BeautifulSoup
    
    import Crawler.BookPack as BookPack
    import Crawler.AuthorPack as AuthorPack
    
    
    
    class Spider:
    
        def __init__(self, url):
            self.url = url
            self.soup = None
    
    
        def crawl(self, book_dict, author_dict, similar_book_dict):
            header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    
                                    "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
    
            url = self.url
            res = requests.get(url, headers=header)
    
            self.soup = BeautifulSoup(res.text, "lxml")
            titleAndAuthor = self.soup.title.string.split(" by ")
            title = titleAndAuthor[0]
            bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
            rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
            rating = float(str(rating_raw).replace(" ", "").replace("\n", ""))
            rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
            review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])
            ISBN13_raw = self.soup.find("meta", property="books:isbn")
            if ISBN13_raw["content"] == "null":
                print("Warning: book '" + title + "' has no ISBN code")
                ISBN13 = 0
            else:
                ISBN13 = int(ISBN13_raw["content"])
            imageURL = self.soup.find("img", id="coverImage")["src"]
            author = titleAndAuthor[1]
            author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
            authorID = (author_page_link.split("show/")[1]).split(".")[0]
            see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
            res_similar = requests.get(see_more_link, headers=header)
            similar_soup = BeautifulSoup(res_similar.text, 'lxml')
            similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
            similar_books = []
            for i in range(len(similar_books_and_authors)):
                if i % 2 == 0:
                    # a book
                    similar_book = similar_books_and_authors[i]
                    similar_book_name = similar_book.text
                    book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
                    similar_books.append(similar_book_name)
                    similar_book_dict[similar_book_name] = book_link
    
            bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
                                           review_count, imageURL, similar_books)
            book_dict[title] = bookPKT
            res_author = requests.get(author_page_link, headers=header)
            author_soup = BeautifulSoup(res_author.text, 'lxml')
            author_rating = float(author_soup.find("span", {"class": "average"}).text)
            author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
            author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
            author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]
            related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)
    
            related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
            res_related_authors = requests.get(related_authors_link, headers=header)
            related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml')
            related_authors = []
            related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
            for the_author in related_authors_raw:
                related_author_name = the_author.text
                if related_author_name == author:
                    pass
                else:
                    related_authors.append(related_author_name)
            author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
            res_author_books = requests.get(author_books_link, headers=header)
            author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
            author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
            author_books = []
            for book in author_books_raw:
                book_name = book.findChildren("span", recursive=False)
                if not book_name:
                    pass
                else:
                    book_name = book_name[0].text
                    author_books.append(book_name)
            author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count,
                                                 author_review_count, author_photo_url, related_authors, author_books)
            author_dict[author] = author_pkt