import requests
from bs4 import BeautifulSoup

import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack


class Spider:
    """Scrapes a Goodreads book page plus its author and similar-books pages."""

    def __init__(self, url):
        self.url = url
        self.soup = None

    def crawl(self, book_dict, author_dict, similar_book_dict):
        # Use a desktop browser user-agent so Goodreads serves the full page.
        header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
        url = self.url
        res = requests.get(url, headers=header)
        self.soup = BeautifulSoup(res.text, "lxml")

        # The page title has the form "<book title> by <author name>".
        titleAndAuthor = self.soup.title.string.split(" by ")
        title = titleAndAuthor[0]
        # The numeric book ID is embedded in the URL after "show/".
        bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]

        # Rating, rating count, and review count come from itemprop metadata.
        rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
        rating = float(rating_raw.replace(" ", "").replace("\n", ""))
        rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
        review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])

        # Some books have no ISBN; Goodreads renders the literal string "null".
        ISBN13_raw = self.soup.find("meta", property="books:isbn")
        if ISBN13_raw["content"] == "null":
            print("Warning: book '" + title + "' has no ISBN code")
            ISBN13 = 0
        else:
            ISBN13 = int(ISBN13_raw["content"])

        imageURL = self.soup.find("img", id="coverImage")["src"]
        author = titleAndAuthor[1]
        author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
        authorID = (author_page_link.split("show/")[1]).split(".")[0]

        # Follow the "See more" link to the similar-books page.
        see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
        res_similar = requests.get(see_more_link, headers=header)
        similar_soup = BeautifulSoup(res_similar.text, "lxml")

        # The similar-books page alternates book and author <span itemprop="name">
        # elements, so even indices are books and odd indices are their authors.
        similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
        similar_books = []
        for i in range(len(similar_books_and_authors)):
            if i % 2 == 0:  # a book
                similar_book = similar_books_and_authors[i]
                similar_book_name = similar_book.text
                book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
                similar_books.append(similar_book_name)
                similar_book_dict[similar_book_name] = book_link

        bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating,
                                       rating_count, review_count, imageURL, similar_books)
        book_dict[title] = bookPKT

        # Crawl the author page for aggregate statistics and the photo URL.
        res_author = requests.get(author_page_link, headers=header)
        author_soup = BeautifulSoup(res_author.text, "lxml")
        author_rating = float(author_soup.find("span", {"class": "average"}).text)
        author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
        author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
        author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]

        # The hreview-aggregate block links to the author's books (index 0)
        # and to related authors (index 1).
        related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)
        related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
        res_related_authors = requests.get(related_authors_link, headers=header)
        related_author_soup = BeautifulSoup(res_related_authors.text, "lxml")

        # Collect related authors, skipping the author of the current book.
        related_authors = []
        related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
        for the_author in related_authors_raw:
            related_author_name = the_author.text
            if related_author_name != author:
                related_authors.append(related_author_name)

        # Collect the titles of the author's other books.
        author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
        res_author_books = requests.get(author_books_link, headers=header)
        author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
        author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
        author_books = []
        for book in author_books_raw:
            book_name = book.findChildren("span", recursive=False)
            if book_name:
                author_books.append(book_name[0].text)

        author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating,
                                              author_rating_count, author_review_count, author_photo_url,
                                              related_authors, author_books)
        author_dict[author] = author_pkt
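

# A minimal usage sketch, not part of the original module: it assumes the
# Crawler.BookPack / Crawler.AuthorPack packages are importable and uses an
# example Goodreads URL of the "book/show/<id>-<slug>" form that the bookID
# parsing above expects. The URL and dictionary names here are illustrative.
if __name__ == "__main__":
    books, authors, similar_books = {}, {}, {}
    # Example book URL (assumed format); swap in any Goodreads book page.
    spider = Spider("https://www.goodreads.com/book/show/3735293-clean-code")
    spider.crawl(books, authors, similar_books)
    print("Crawled books:", list(books.keys()))
    print("Crawled authors:", list(authors.keys()))
    print("Similar books found:", len(similar_books))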