# Spider.py — Goodreads crawler: scrapes one book page, its "similar books"
# list, and the author's profile into caller-supplied dictionaries.
  • import requests
    from bs4 import BeautifulSoup
    
    import Crawler.BookPack as BookPack
    import Crawler.AuthorPack as AuthorPack
    
    
    
    class Spider:
    
        def __init__(self, url):
            self.url = url
            self.soup = None
    
    
        def crawl(self, book_dict, author_dict, similar_book_dict):
            header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    
                                    "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
    
            url = self.url
            res = requests.get(url, headers=header)
    
            self.soup = BeautifulSoup(res.text, "lxml")
            titleAndAuthor = self.soup.title.string.split(" by ")
            title = titleAndAuthor[0]
            bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
            rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
            rating = float(str(rating_raw).replace(" ", "").replace("\n", ""))
            rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
            review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])
            ISBN13_raw = self.soup.find("meta", property="books:isbn")
            if ISBN13_raw["content"] == "null":
                print("Warning: book '" + title + "' has no ISBN code")
                ISBN13 = 0
            else:
                ISBN13 = int(ISBN13_raw["content"])
            imageURL = self.soup.find("img", id="coverImage")["src"]
            author = titleAndAuthor[1]
            author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
            authorID = (author_page_link.split("show/")[1]).split(".")[0]
            see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
            res_similar = requests.get(see_more_link, headers=header)
            similar_soup = BeautifulSoup(res_similar.text, 'lxml')
            similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
            similar_books = []
            for i in range(len(similar_books_and_authors)):
                if i % 2 == 0:
                    # a book
                    similar_book = similar_books_and_authors[i]
                    similar_book_name = similar_book.text
                    book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
                    similar_books.append(similar_book_name)
                    similar_book_dict[similar_book_name] = book_link
    
            bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
                                           review_count, imageURL, similar_books)
            book_dict[title] = bookPKT
            res_author = requests.get(author_page_link, headers=header)
            author_soup = BeautifulSoup(res_author.text, 'lxml')
            author_rating = float(author_soup.find("span", {"class": "average"}).text)
            author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
            author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
            author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]
            related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)
    
            related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
            res_related_authors = requests.get(related_authors_link, headers=header)
            related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml')
            related_authors = []
            related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
            for the_author in related_authors_raw:
                related_author_name = the_author.text
                if related_author_name == author:
                    pass
                else:
                    related_authors.append(related_author_name)
            author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
            res_author_books = requests.get(author_books_link, headers=header)
            author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
            author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
            author_books = []
            for book in author_books_raw:
                book_name = book.findChildren("span", recursive=False)
                if not book_name:
                    pass
                else:
                    book_name = book_name[0].text
                    author_books.append(book_name)
            author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count,
                                                 author_review_count, author_photo_url, related_authors, author_books)
            author_dict[author] = author_pkt