# Newer
# Older
import requests
from bs4 import BeautifulSoup
import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack
class Spider:
    """Scrape a single Goodreads book page plus the pages of its author.

    The spider is bound to one book URL. Calling :meth:`crawl` downloads the
    book page, its "similar books" page, the author page, the author's
    related-authors page and the author's book list, and stores the collected
    data in the dictionaries supplied by the caller.

    Attributes:
        url: The book page URL handed to the constructor.
        soup: Parsed book page; ``None`` until :meth:`crawl` has run.
    """

    # User-agent header sent with every request issued by the spider.
    _HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                              "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

    # Prefix used to turn site-relative hrefs into absolute URLs.
    _SITE_ROOT = "https://www.goodreads.com"

    def __init__(self, url):
        """Remember the book page *url*; nothing is downloaded yet."""
        self.url = url
        self.soup = None

    def _fetch_soup(self, url, timeout=None):
        """Download *url* and return its body parsed with the lxml parser."""
        response = requests.get(url, headers=self._HEADERS, timeout=timeout)
        return BeautifulSoup(response.text, "lxml")

    @staticmethod
    def _split_title_and_author(page_title):
        """Split a ``"<title> by <author>"`` page title into its two parts.

        The split happens on the LAST ``" by "`` so that a book title which
        itself contains " by " is kept in one piece.

        Raises:
            IndexError: If *page_title* contains no ``" by "`` separator.
        """
        parts = page_title.rsplit(" by ", 1)
        return parts[0], parts[1]

    @staticmethod
    def _book_id_from_url(url):
        """Return the text between ``show/`` and the first ``-`` or ``.``."""
        return url.split("show/")[1].split("-")[0].split(".")[0]

    @staticmethod
    def _parse_isbn13(isbn_tag, title):
        """Return the ISBN stored in *isbn_tag* as an int, or 0 if absent.

        A missing tag is handled the same way as a tag whose content is the
        literal string ``"null"``: a warning naming *title* is printed and 0
        is returned.
        """
        if isbn_tag is None or isbn_tag["content"] == "null":
            print("Warning: book '" + title + "' has no ISBN code")
            return 0
        return int(isbn_tag["content"])

    def crawl(self, book_dict, author_dict, similar_book_dict, timeout=None):
        """Scrape the book at ``self.url`` and its author into the given dicts.

        Args:
            book_dict: Receives ``title -> BookPack.BookPackage``.
            author_dict: Receives ``author name -> AuthorPack.AuthorPackage``.
            similar_book_dict: Receives ``similar book name -> absolute URL``
                for every similar book that is found.
            timeout: Optional timeout forwarded to every ``requests.get``
                call. The default ``None`` means no timeout is applied.

        Returns:
            None. Results are delivered by mutating the three dictionaries;
            the book entries are written before the author pages are fetched.

        Raises:
            IndexError: If the page title has no ``" by "`` separator.
            Other exceptions propagate unchanged when a request fails or an
            expected element is not present in a downloaded page.
        """
        url = self.url
        self.soup = self._fetch_soup(url, timeout)
        page = self.soup

        # ---- book page -------------------------------------------------
        title, author = self._split_title_and_author(page.title.string)
        book_id = self._book_id_from_url(url)

        rating_text = page.find("span", {"itemprop": "ratingValue"}).text
        rating = float(str(rating_text).replace(" ", "").replace("\n", ""))
        rating_count = int(page.find("meta", itemprop="ratingCount")["content"])
        review_count = int(page.find("meta", itemprop="reviewCount")["content"])
        isbn13 = self._parse_isbn13(page.find("meta", property="books:isbn"), title)
        image_url = page.find("img", id="coverImage")["src"]

        author_page_link = page.find("a", {"class": "authorName"})["href"]
        author_id = author_page_link.split("show/")[1].split(".")[0]

        # ---- similar books ---------------------------------------------
        see_more_link = page.find("a", {"class": "actionLink right seeMoreLink"})["href"]
        similar_soup = self._fetch_soup(see_more_link, timeout)
        name_spans = similar_soup.find_all("span", itemprop="name")
        similar_books = []
        # Only the even positions are book names; the odd positions are
        # skipped (presumably the matching author names - confirm on a page).
        for name_span in name_spans[::2]:
            similar_name = name_span.text
            similar_books.append(similar_name)
            similar_book_dict[similar_name] = self._SITE_ROOT + name_span.find_parent("a")["href"]

        book_dict[title] = BookPack.BookPackage(
            url, title, book_id, isbn13, author_page_link, author, rating, rating_count,
            review_count, image_url, similar_books)

        # ---- author page -----------------------------------------------
        author_soup = self._fetch_soup(author_page_link, timeout)
        author_rating = float(author_soup.find("span", {"class": "average"}).text)
        # The first "value-title" span holds the rating count, the second the
        # review count.
        author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
        author_review_count = float(author_soup.find_all("span", {"class": "value-title"})[1]["title"])
        author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]

        # Direct <a> children of the aggregate block: index 0 leads to the
        # author's book list, index 1 to the related-authors page.
        related_links = author_soup.find("div", {"class": "hreview-aggregate"}).find_all(
            "a", recursive=False)

        related_author_soup = self._fetch_soup(self._SITE_ROOT + related_links[1]["href"], timeout)
        related_authors = [
            name_span.text
            for name_span in related_author_soup.find_all("span", {"itemprop": "name"})
            if name_span.text != author  # leave the crawled author out of the list
        ]

        author_books_soup = self._fetch_soup(self._SITE_ROOT + related_links[0]["href"], timeout)
        author_books = []
        for book_link in author_books_soup.find_all("a", {"class": "bookTitle"}):
            title_spans = book_link.find_all("span", recursive=False)
            if title_spans:  # links without a direct <span> child carry no title
                author_books.append(title_spans[0].text)

        author_dict[author] = AuthorPack.AuthorPackage(
            author, author_page_link, author_id, author_rating, author_rating_count,
            author_review_count, author_photo_url, related_authors, author_books)