From 8959e6bee5baa5aa08a643c4bb4409ca3afef3e1 Mon Sep 17 00:00:00 2001
From: HenryShan <zshan2@illinois.edu>
Date: Mon, 8 Mar 2021 15:48:05 +0800
Subject: [PATCH] assignment-2.1 version2.1: GET API fixed

---
 API/Functions.py                  |  21 ++++
 Crawler/BookPack.py               |   6 +-
 Crawler/Main.py                   |  51 +++++++++++--
 Crawler/Spider.py                 |  86 +++++++++++++++++----
 DataBase/JsonParser.py            |  53 +++++++++++++
 DataBase/mongoDB.py               | 123 ++++++++++++++++++++++++++++++
 RegularExpressionParser/Parser.py |  78 +++++++++++++++++++
 Server/SimpleServer.py            |  72 ++++++++++++++++++
 8 files changed, 465 insertions(+), 25 deletions(-)

diff --git a/API/Functions.py b/API/Functions.py
index e69de29..96e4881 100644
--- a/API/Functions.py
+++ b/API/Functions.py
@@ -0,0 +1,21 @@
+import requests
+import os
+import RegularExpressionParser.Parser as parser
+import re
+from dotenv import load_dotenv
+
+
+def get(query):
+    """
+    Send a GET request to find one or several documents
+    :param query: user input query
+    :return: JSON data of the matching book(s) or author(s)
+    """
+    load_dotenv()
+    host = os.getenv('SERVER_HOST')
+    url = host + parser.url_safe(query)
+    req = requests.get(url)
+    return req.json()
+
+
+
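For context, a minimal usage sketch of the new GET helper. It assumes a `.env` file defining `SERVER_HOST` (for example `http://127.0.0.1:5000/api/`) and a running instance of `Server/SimpleServer.py`; the query value is illustrative only:

```python
# Hypothetical client call; SERVER_HOST and the query string are assumptions.
import API.Functions as functions

# url_safe() percent-encodes any quotes and comparison signs in the query
# before the request is sent.
result = functions.get("search?q=book.rating%3A%3E4")
print(result)
```
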
diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py
index 17b404e..42d4162 100644
--- a/Crawler/BookPack.py
+++ b/Crawler/BookPack.py
@@ -1,15 +1,15 @@
 class BookPackage:
-    def __init__(self, url, title, id, ISBN, author_url, author, rating,
+    def __init__(self, url, title, id, ISBN13, author_url, author, rating,
                  rating_count, review_count, image_url, similar_books):
         self.url = url
         self.title = title
         self.id = id
-        self.ISBN = ISBN
+        self.ISBN13 = ISBN13
         self.author_url = author_url
         self.author = author
         self.rating = rating
         self.rating_count = rating_count
         self.review_count = review_count
         self.image_url = image_url
-        self.similar_books = similar_books
\ No newline at end of file
+        self.similar_books = similar_books
diff --git a/Crawler/Main.py b/Crawler/Main.py
index c839815..24a7e96 100644
--- a/Crawler/Main.py
+++ b/Crawler/Main.py
@@ -1,12 +1,49 @@
 import Crawler.Spider as Spider
-import Crawler.BookPack as BookPack
-import Crawler.AuthorPack as AuthorPack
+import time
+from DataBase import mongoDB as db
 
 print('please enter URL of the book')
-# url = input()
-url = 'https://www.goodreads.com/book/show/3735293-clean-code'
-print('URL received, start scraping')
+url = input()
+# url = https://www.goodreads.com/book/show/3735293-clean-code
+# https://www.goodreads.com/book/show/35133922-educated
+print('please enter the number of books you want to crawl')
+book_num = input()
+# book_num = 3
+print('please enter the number of authors you want to crawl')
+author_num = input()
+# author_num = 2
+print('Request received, start scraping')
 
-startPage = Spider.Spider(url)
-startPage.scrap()
+book_dict = {}
+author_dict = {}
+similar_book_dict = {}
+count = 1
+
+print("Crawling book #" + str(count))
+start_page = Spider.Spider(url)
+start_page.crawl(book_dict, author_dict, similar_book_dict)
+
+
+while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
+    all_related_books = list(similar_book_dict.items())
+    for bookURLPair in all_related_books:
+        if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
+            break
+        book_name = bookURLPair[0]
+        if book_name in book_dict:
+            pass
+        else:
+            count += 1
+            print("Crawling book #" + str(count))
+            bookURL = bookURLPair[1]
+            book_spider = Spider.Spider(bookURL)
+            book_spider.crawl(book_dict, author_dict, similar_book_dict)
+
+            # Since the network delay from Asia to goodreads.com is already high,
+            # only a short sleep is added between requests.
+            time.sleep(0.2)
+
+db.insert_dicts(book_dict, 0)
+db.insert_dicts(author_dict, 1)
+print("Scraping completed")
diff --git a/Crawler/Spider.py b/Crawler/Spider.py
index 0b46824..a689789 100644
--- a/Crawler/Spider.py
+++ b/Crawler/Spider.py
@@ -1,5 +1,7 @@
 import requests
 from bs4 import BeautifulSoup
+import Crawler.BookPack as BookPack
+import Crawler.AuthorPack as AuthorPack
 
 
 class Spider:
@@ -8,26 +10,80 @@ class Spider:
         self.url = url
         self.soup = None
 
-    def scrap(self):
-        header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    def crawl(self, book_dict, author_dict, similar_book_dict):
+        header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                                 "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
         url = self.url
         res = requests.get(url, headers=header)
-        self.soup = BeautifulSoup(res.text, 'lxml')
+        self.soup = BeautifulSoup(res.text, "lxml")
+        titleAndAuthor = self.soup.title.string.split(" by ")
+        title = titleAndAuthor[0]
+        bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
+        rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text
+        rating = float(str(rating_raw).replace(" ", "").replace("\n", ""))
+        rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"])
+        review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"])
+        ISBN13_raw = self.soup.find("meta", property="books:isbn")
+        if ISBN13_raw["content"] == "null":
+            print("Warning: book '" + title + "' has no ISBN code")
+            ISBN13 = 0
+        else:
+            ISBN13 = int(ISBN13_raw["content"])
+        imageURL = self.soup.find("img", id="coverImage")["src"]
+        author = titleAndAuthor[1]
+        author_page_link = self.soup.find("a", {"class": "authorName"})["href"]
+        authorID = (author_page_link.split("show/")[1]).split(".")[0]
+        see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
+        res_similar = requests.get(see_more_link, headers=header)
+        similar_soup = BeautifulSoup(res_similar.text, 'lxml')
+        similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
+        similar_books = []
+        for i in range(len(similar_books_and_authors)):
+            if i % 2 == 0:
+                # a book
+                similar_book = similar_books_and_authors[i]
+                similar_book_name = similar_book.text
+                book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"]
+                similar_books.append(similar_book_name)
+                similar_book_dict[similar_book_name] = book_link
 
-        seeMoreLink = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > '
-                                       'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > '
-                                       'div.bigBoxBody > div > a')[0].get("href")
-        print(seeMoreLink)
+        bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
+                                       review_count, imageURL, similar_books)
+        book_dict[title] = bookPKT
+        res_author = requests.get(author_page_link, headers=header)
+        author_soup = BeautifulSoup(res_author.text, 'lxml')
+        author_rating = float(author_soup.find("span", {"class": "average"}).text)
+        author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"])
+        author_review_count = float((author_soup.find_all("span", {"class": "value-title"})[1])["title"])
+        author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"]
+        related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False)
+        related_authors_link = "https://www.goodreads.com" + related_links[1]["href"]
+        res_related_authors = requests.get(related_authors_link, headers=header)
+        related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml')
+        related_authors = []
+        related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"})
+        for the_author in related_authors_raw:
+            related_author_name = the_author.text
+            if related_author_name == author:
+                pass
+            else:
+                related_authors.append(related_author_name)
+        author_books_link = "https://www.goodreads.com" + related_links[0]["href"]
+        res_author_books = requests.get(author_books_link, headers=header)
+        author_books_soup = BeautifulSoup(res_author_books.text, "lxml")
+        author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"})
+        author_books = []
+        for book in author_books_raw:
+            book_name = book.findChildren("span", recursive=False)
+            if not book_name:
+                pass
+            else:
+                book_name = book_name[0].text
+                author_books.append(book_name)
+        author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count,
+                                              author_review_count, author_photo_url, related_authors, author_books)
+        author_dict[author] = author_pkt
 
-# print(soup)
-# print(soup.title.string)
-# print(soup.find_all('a'))
-# aTypes = soup.find_all('a')[0]
-# print(aTypes)
-# for obj in aTypes:
-#     if obj.
-# print(soup.)
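Taken together with Main.py above, the crawl contract is one `Spider` per book page, mutating the three shared dicts in place. A condensed driver sketch that mirrors Main.py rather than adding new behavior (sample URL taken from the comments in Main.py):

```python
# Condensed sketch of how crawl() is driven; no new behavior is assumed.
import Crawler.Spider as Spider

book_dict, author_dict, similar_book_dict = {}, {}, {}
spider = Spider.Spider("https://www.goodreads.com/book/show/3735293-clean-code")
spider.crawl(book_dict, author_dict, similar_book_dict)

# book_dict now maps the title to a BookPackage; similar_book_dict maps
# related titles to URLs that Main.py feeds into new Spider instances.
print(list(similar_book_dict.items())[:3])
```
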
diff --git a/DataBase/JsonParser.py b/DataBase/JsonParser.py
index e69de29..2a5e524 100644
--- a/DataBase/JsonParser.py
+++ b/DataBase/JsonParser.py
@@ -0,0 +1,53 @@
+
+def parse_book_dict_to_json(dictionary):
+    item_list = list(dictionary.items())
+    return_list = []
+    for items in item_list:
+        book_package = items[1]
+        book_json = parse_book_to_json(book_package)
+        return_list.append(book_json)
+    return return_list
+
+
+def parse_author_dict_to_json(dictionary):
+    item_list = list(dictionary.items())
+    return_list = []
+    for items in item_list:
+        author_package = items[1]
+        author_json = parse_author_to_json(author_package)
+        return_list.append(author_json)
+    return return_list
+
+
+def parse_book_to_json(bookPKT):
+    book_json = {
+        "type": "book",
+        "book_url": bookPKT.url,
+        "title": bookPKT.title,
+        "id": bookPKT.id,
+        "ISBN": bookPKT.ISBN13,
+        "author_url": bookPKT.author_url,
+        "author": bookPKT.author,
+        "rating": bookPKT.rating,
+        "rating_count": bookPKT.rating_count,
+        "review_count": bookPKT.review_count,
+        "image_url": bookPKT.image_url,
+        "similar_books": bookPKT.similar_books
+    }
+    return book_json
+
+
+def parse_author_to_json(authorPKT):
+    author_json = {
+        "type": "author",
+        "name": authorPKT.name,
+        "author_url": authorPKT.url,
+        "id": authorPKT.id,
+        "rating": authorPKT.rating,
+        "rating_count": authorPKT.rating_count,
+        "review_count": authorPKT.review_count,
+        "image_url": authorPKT.image_url,
+        "related_authors": authorPKT.related_authors,
+        "author_books": authorPKT.author_books
+    }
+    return author_json
diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py
index e69de29..c4e2222 100644
--- a/DataBase/mongoDB.py
+++ b/DataBase/mongoDB.py
@@ -0,0 +1,123 @@
+import json
+import DataBase.JsonParser as parser
+import os
+from dotenv import load_dotenv
+from pymongo import MongoClient
+
+
+def get_db():
+    """ return the database of goodReads crawler """
+    load_dotenv()
+    url = os.getenv('MONGODB_URL')
+    client = MongoClient(url)
+    return client.get_database("crawler_db")
+
+
+def insert_dicts(dictionary, opt):
+    """
+    Insert books or authors into the matching collection of the database
+    :param dictionary: the dictionary to insert into the collection
+    :param opt: =0 means books collection; =1 means authors collection
+    :return: no return value
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to insert: wrong opt for selecting collection")
+        return
+    json_list = []
+    if opt == 0:
+        json_list = parser.parse_book_dict_to_json(dictionary)
+    elif opt == 1:
+        json_list = parser.parse_author_dict_to_json(dictionary)
+    records.insert_many(json_list)
+
+
+def update_dicts(opt, identifier, content):
+    """
+    Update documents in a given collection
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: the identifier of the documents we want to update
+    :param content: the content to update
+    :return: no return value
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to update: wrong opt for selecting collection")
+        return
+    result = records.update_many(
+        identifier,
+        {"$set": content},
+    )
+    print("matched documents: " + str(result.matched_count))
+    print("modified documents: " + str(result.modified_count))
+
+
+def get_documents_json(opt, identifier):
+    """
+    Find documents specified by the identifier and output them as JSON
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: identifier of the documents we want, {} means the whole collection
+    :return: JSON data of the selected documents
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to get json file: wrong opt for selecting collection")
+        return json.dumps({})
+    data = records.find(identifier)
+    file = {}
+    if opt == 0:
+        typeName = "books"
+    else:
+        typeName = "authors"
+    file[typeName] = []
+    for item in data:
+        item.pop("_id")
+        file[typeName].append(item)
+    return json.dumps(file)
+
+
+def download_collection(opt, identifier, name):
+    """
+    Download the books or authors collection as a local JSON file
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: identifier of the documents we want to download;
+            empty({}) means select all documents in the given collection
+    :param name: file name of the downloaded JSON
+    :return: no return value; writes <ROOT><name>.json to disk
+    """
+    json_file = get_documents_json(opt, identifier)
+    load_dotenv()
+    root = os.getenv('ROOT')
+    with open(root + name + ".json", "w") as output:
+        output.write(json_file)
+
+
+def clean(opt, identifier):
+    """
+    Delete specific documents in the given collection
+    :param opt: =0 means books collection; =1 means authors collection
+    :param identifier: identifier of the documents we want to delete;
+            empty({}) means select all documents in the given collection
+    :return: no return value
+    """
+    db = get_db()
+    if opt == 0:
+        records = db.books
+    elif opt == 1:
+        records = db.authors
+    else:
+        print("failed to delete: wrong opt for selecting collection")
+        return
+    records.delete_many(identifier)
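A short sketch of how these helpers are meant to be called, assuming `.env` defines `MONGODB_URL` and `ROOT` and that a crawl has already populated the books collection; the title filter is an example value:

```python
# Identifiers are plain MongoDB filters; the values here are assumptions.
import DataBase.mongoDB as db

print(db.get_documents_json(0, {"title": "Clean Code"}))   # opt=0: books
db.update_dicts(0, {"title": "Clean Code"}, {"rating": 4.4})
db.download_collection(1, {}, "authors_backup")            # writes ROOT/authors_backup.json
```
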
diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py
index e69de29..27f4fe6 100644
--- a/RegularExpressionParser/Parser.py
+++ b/RegularExpressionParser/Parser.py
@@ -0,0 +1,78 @@
+import re
+
+
+def check_if_address_valid(address):
+    """
+    Check whether the input address is a valid Goodreads book URL
+    :param address: the URL to validate
+    :return: True if the address is valid, False otherwise
+    """
+    x = re.search("^https://www.goodreads.com/book/show/[0-9]+", address)
+    if x:
+        return True
+    else:
+        return False
+
+
+def parse_query_to_url(query):
+    elements = re.findall("[0-9A-Za-z_\"><]+", query)
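To make the accepted query grammar concrete, a few illustrative calls (note that the tokenizer has no decimal-point support, so numeric operands should be integers):

```python
import RegularExpressionParser.Parser as parser

parser.check_if_address_valid("https://www.goodreads.com/book/show/3735293-clean-code")  # True
parser.parse_query_to_url("book.rating: > 4")   # 'book.rating%3A%3E4'
parser.parse_query_to_json("book.rating: > 4")  # {'book.rating': {'$gt': 4.0}}
parser.parse_query_to_json("book.author: NOT Martin")
# {'book.author': {'$not': {'$regex': 'Martin', '$options': 'i'}}}
```
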
+    count = len(elements)
+    if count == 3:
+        # can only be A.B:C or wrong
+        if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
+        else:
+            print("Invalid query.")
+            return ""
+    elif count == 4:
+        # a pair and one of [NOT, >, <].
+        if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\sNOT\s[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[3])
+        elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s>\s[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[3])
+        elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:\s<\s[0-9a-zA-Z_\"]", query):
+            return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[3])
+        else:
+            print("Invalid query.")
+            return ""
+    elif 6 <= count <= 8:
+        # AND or OR operator
+        if re.search(".*\sAND\s.*", query):
+            parts = query.split(" AND ")
+            return parse_query_to_url(parts[0]) + "%26AND%26" + parse_query_to_url(parts[1])
+        elif re.search(".*\sOR\s.*", query):
+            parts = query.split(" OR ")
+            return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1])
+        else:
+            print("Invalid query.")
+            return ""
+
+
+def parse_query_to_json(pair):
+    elements = re.findall("[0-9A-Za-z_\"><]+", pair)
+    count = len(elements)
+    print(count)
+    if count != 3 and count != 4:
+        print("Failed to parse query: invalid args number")
+        return {"wrong": "True"}  # will never be found in the database
+    elif count == 3:
+        # can be A.B: C or A.B: "C"
+        if re.search("\".*\"", elements[2]):
+            return {elements[0] + "." + elements[1]: elements[2]}
+        else:
+            return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}}
+    else:
+        # can be NOT, >, or <
+        if elements[2] == "NOT":
+            return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}}
+        elif elements[2] == ">":
+            return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}}
+        elif elements[2] == "<":
+            return {elements[0] + "." + elements[1]: {"$lt": float(elements[3])}}
+        else:
+            print("Failed to parse query: unknown operator")
+            return {"wrong": "True"}
+
+
+def url_safe(element):
+    return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C")
\ No newline at end of file
diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py
index e69de29..c416ace 100644
--- a/Server/SimpleServer.py
+++ b/Server/SimpleServer.py
@@ -0,0 +1,72 @@
+import RegularExpressionParser.Parser as parser
+import DataBase.mongoDB as db
+import re
+from flask import Flask
+from flask import request
+from urllib.parse import urlparse, parse_qs
+
+app = Flask(__name__)
+app.config["DEBUG"] = True
+
+
+@app.route("/", methods=['GET'])
+def home():
+    return "200: successfully connected to home page\n"
+
+
+@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
+def data(collection):
+    if request.method == "GET":
+        if collection == "books" or collection == "book":
+            identifier = request.args.to_dict()
+            print(identifier)
+            print(type(identifier))
+            return "200: find"
+        elif collection == "authors" or collection == "author":
+            print(request.url)
+            return "200: find"
+        elif collection == "search":
+            url_parsed = urlparse(request.url)
+            qs_parsed = parse_qs(url_parsed.query)
+            return search_document(qs_parsed["q"][0].split("&"))
+        else:
+            return "404 not found"
+    # elif request.method == "PUT":
+
+
+def search_document(identifiers):
+    if len(identifiers) == 1:
+        json_idt = parser.parse_query_to_json(identifiers[0])
+        if re.search("^book.*", identifiers[0]):
+            return db.get_documents_json(0, json_idt)
+        else:
+            return db.get_documents_json(1, json_idt)
+    elif len(identifiers) == 3:
+        if re.search("^book.*", identifiers[0]):
+            if re.search("^author.*", identifiers[2]):
+                print("Failed to find documents: the two statements do not point to the same collection")
+                return {}
+            else:
+                opt = 0
+        else:
+            if re.search("^book.*", identifiers[2]):
+                print("Failed to find documents: the two statements do not point to the same collection")
+                return {}
+            else:
+                opt = 1
+        json_idt1 = parser.parse_query_to_json(identifiers[0])
+        json_idt2 = parser.parse_query_to_json(identifiers[2])
+        if identifiers[1] == "AND":
+            exp = {"$and": [json_idt1, json_idt2]}
+        elif identifiers[1] == "OR":
+            exp = {"$or": [json_idt1, json_idt2]}
+        else:
+            print("Failed to parse query: unknown operator for identifiers[1]")
+            return {}
+        return db.get_documents_json(opt, exp)
+    else:
+        return "Error, unknown identifiers"
+
+
+if __name__ == "__main__":
+    app.run()
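And a sketch of the routes from the client side, assuming Flask's default local address; the query strings are examples and must be percent-encoded in the form `Parser.url_safe` produces:

```python
import requests

base = "http://127.0.0.1:5000"
print(requests.get(base + "/").text)  # "200: successfully connected to home page"

# Single-clause search; decoded server-side to  book.rating:>4
print(requests.get(base + "/api/search/?q=book.rating%3A%3E4").text)

# AND of two clauses: %26 decodes to '&', which search_document splits on
print(requests.get(base + "/api/search/?q=book.rating%3A%3E4%26AND%26book.rating%3A%3C5").text)
```
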
-- 
GitLab