From e51da8b907a087f9e7a22c30523156d424625b76 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 2 Mar 2021 07:50:31 +0800 Subject: [PATCH 1/9] assignment2.0 ver1.0: basic classes added --- Crawler/AuthorPack.py | 13 +++++++++++++ Crawler/BookPack.py | 15 +++++++++++++++ Crawler/Main.py | 12 ++++++++++++ Crawler/Spider.py | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) create mode 100644 Crawler/AuthorPack.py create mode 100644 Crawler/BookPack.py create mode 100644 Crawler/Main.py create mode 100644 Crawler/Spider.py diff --git a/Crawler/AuthorPack.py b/Crawler/AuthorPack.py new file mode 100644 index 0000000..896b8e8 --- /dev/null +++ b/Crawler/AuthorPack.py @@ -0,0 +1,13 @@ + +class AuthorPackage: + def __init__(self, name, url, id, rating, rating_count, + review_count, image_url, related_authors, author_books): + self.name = name + self.url = url + self.id = id + self.rating = rating + self.rating_count = rating_count + self.review_count = review_count + self.image_url = image_url + self.related_authors = related_authors + self.author_books = author_books diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py new file mode 100644 index 0000000..17b404e --- /dev/null +++ b/Crawler/BookPack.py @@ -0,0 +1,15 @@ + +class BookPackage: + def __init__(self, url, title, id, ISBN, author_url, author, rating, + rating_count, review_count, image_url, similar_books): + self.url = url + self.title = title + self.id = id + self.ISBN = ISBN + self.author_url = author_url + self.author = author + self.rating = rating + self.rating_count = rating_count + self.review_count = review_count + self.image_url = image_url + self.similar_books = similar_books \ No newline at end of file diff --git a/Crawler/Main.py b/Crawler/Main.py new file mode 100644 index 0000000..c839815 --- /dev/null +++ b/Crawler/Main.py @@ -0,0 +1,12 @@ +import Crawler.Spider as Spider +import Crawler.BookPack as BookPack +import Crawler.AuthorPack as AuthorPack + + +print('please enter URL of the book') +# url = input() +url = 'https://www.goodreads.com/book/show/3735293-clean-code' +print('URL received, start scraping') + +startPage = Spider.Spider(url) +startPage.scrap() diff --git a/Crawler/Spider.py b/Crawler/Spider.py new file mode 100644 index 0000000..0b46824 --- /dev/null +++ b/Crawler/Spider.py @@ -0,0 +1,33 @@ +import requests +from bs4 import BeautifulSoup + + +class Spider: + + def __init__(self, url): + self.url = url + self.soup = None + + def scrap(self): + header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} + + url = self.url + res = requests.get(url, headers=header) + self.soup = BeautifulSoup(res.text, 'lxml') + + seeMoreLink = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > ' + 'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > ' + 'div.bigBoxBody > div > a')[0].get("href") + print(seeMoreLink) + + + +# print(soup) +# print(soup.title.string) +# print(soup.find_all('a')) +# aTypes = soup.find_all('a')[0] +# print(aTypes) +# for obj in aTypes: +# if obj. +# print(soup.) 
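The scrap() method added above finds the "Readers also enjoyed" link through a long positional CSS selector, which is fragile against any change in the Goodreads page layout; patch 6 in this series replaces it with a class-based lookup. The following is a minimal standalone sketch of that class-based approach, for illustration only, assuming the page still exposes the "actionLink right seeMoreLink" anchor that patch 6 relies on and that requests, beautifulsoup4 and lxml are installed:

    import requests
    from bs4 import BeautifulSoup

    # spoofed browser user-agent, same idea as the header used in scrap()
    HEADER = {"user-agent": "Mozilla/5.0"}

    def find_see_more_link(book_url):
        """Return the 'Readers also enjoyed' link of a Goodreads book page, or None."""
        soup = BeautifulSoup(requests.get(book_url, headers=HEADER).text, "lxml")
        # look the anchor up by its class instead of by its position in the DOM
        link = soup.find("a", {"class": "actionLink right seeMoreLink"})
        return link["href"] if link else None
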
-- GitLab From 186c3acb1bee3d63b43b6021fe700ac7b7e858ed Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 2 Mar 2021 11:51:59 +0800 Subject: [PATCH 2/9] assignment2.0 ver2.0: crawler completed, database package added and in progress --- DataBase/mongoDB.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 DataBase/mongoDB.py diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py new file mode 100644 index 0000000..e69de29 -- GitLab From c2b4b0397f8f97e70cfd191b4d60b427f6137e6a Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 2 Mar 2021 12:59:13 +0800 Subject: [PATCH 3/9] assignment2.0 ver2.1: dataBase uploading ready --- DataBase/JsonParser.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 DataBase/JsonParser.py diff --git a/DataBase/JsonParser.py b/DataBase/JsonParser.py new file mode 100644 index 0000000..e69de29 -- GitLab From b2347adc0582f590bda3015bec84e154d623b0d8 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Mon, 8 Mar 2021 01:28:49 +0800 Subject: [PATCH 4/9] assignment2.1 initial commit --- CommandLineInterface/CLI.py | 0 RegularExpressionParser/Parser.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 CommandLineInterface/CLI.py create mode 100644 RegularExpressionParser/Parser.py diff --git a/CommandLineInterface/CLI.py b/CommandLineInterface/CLI.py new file mode 100644 index 0000000..e69de29 diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py new file mode 100644 index 0000000..e69de29 -- GitLab From 77982c85941ccb03f81e824b575016418e55aba0 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Mon, 8 Mar 2021 15:21:29 +0800 Subject: [PATCH 5/9] assignment-2.1 version2: query parser finished, GET API finished --- .../CLI.py => API/Functions.py | 0 Client/CLI.py | 21 +++++++++++++++++++ Server/SimpleServer.py | 0 3 files changed, 21 insertions(+) rename CommandLineInterface/CLI.py => API/Functions.py (100%) create mode 100644 Client/CLI.py create mode 100644 Server/SimpleServer.py diff --git a/CommandLineInterface/CLI.py b/API/Functions.py similarity index 100% rename from CommandLineInterface/CLI.py rename to API/Functions.py diff --git a/Client/CLI.py b/Client/CLI.py new file mode 100644 index 0000000..b3c40d6 --- /dev/null +++ b/Client/CLI.py @@ -0,0 +1,21 @@ +import typer + +app = typer.Typer() + +@app.command() +def scrape_book(url: str): + """ Function used to create scrape request to Server """ + + + +@app.command() +def goodbye(name: str, formal: bool = False): + """ function2 """ + if formal: + typer.echo(f"Goodbye Ms. {name}. 
Have a good day.") + else: + typer.echo(f"Bye {name}!") + + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py new file mode 100644 index 0000000..e69de29 -- GitLab From 8959e6bee5baa5aa08a643c4bb4409ca3afef3e1 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Mon, 8 Mar 2021 15:48:05 +0800 Subject: [PATCH 6/9] assignment-2.1 version2.1: GET API fixed --- API/Functions.py | 21 ++++++ Crawler/BookPack.py | 6 +- Crawler/Main.py | 51 +++++++++++-- Crawler/Spider.py | 86 +++++++++++++++++---- DataBase/JsonParser.py | 53 +++++++++++++ DataBase/mongoDB.py | 120 ++++++++++++++++++++++++++++++ RegularExpressionParser/Parser.py | 78 +++++++++++++++++++ Server/SimpleServer.py | 72 ++++++++++++++++++ 8 files changed, 462 insertions(+), 25 deletions(-) diff --git a/API/Functions.py b/API/Functions.py index e69de29..96e4881 100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -0,0 +1,21 @@ +import requests +import os +import RegularExpressionParser.Parser as parser +import re +from dotenv import load_dotenv + + +def get(query): + """ + function used to send get request to find one or several documentations + :param query: user input query + :return: json file of the given author or book + """ + load_dotenv() + host = os.getenv('SERVER_HOST') + url = host + parser.url_safe(query) + req = requests.get(url) + return req.json() + + + diff --git a/Crawler/BookPack.py b/Crawler/BookPack.py index 17b404e..42d4162 100644 --- a/Crawler/BookPack.py +++ b/Crawler/BookPack.py @@ -1,15 +1,15 @@ class BookPackage: - def __init__(self, url, title, id, ISBN, author_url, author, rating, + def __init__(self, url, title, id, ISBN13, author_url, author, rating, rating_count, review_count, image_url, similar_books): self.url = url self.title = title self.id = id - self.ISBN = ISBN + self.ISBN13 = ISBN13 self.author_url = author_url self.author = author self.rating = rating self.rating_count = rating_count self.review_count = review_count self.image_url = image_url - self.similar_books = similar_books \ No newline at end of file + self.similar_books = similar_books diff --git a/Crawler/Main.py b/Crawler/Main.py index c839815..24a7e96 100644 --- a/Crawler/Main.py +++ b/Crawler/Main.py @@ -1,12 +1,49 @@ import Crawler.Spider as Spider -import Crawler.BookPack as BookPack -import Crawler.AuthorPack as AuthorPack +import time +from DataBase import mongoDB as db print('please enter URL of the book') -# url = input() -url = 'https://www.goodreads.com/book/show/3735293-clean-code' -print('URL received, start scraping') +url = input() +# url = https://www.goodreads.com/book/show/3735293-clean-code +# https://www.goodreads.com/book/show/35133922-educated +print('please enter the number of books you want to crawl') +book_num = input() +# bookNum = 3 +print('please enter the number of authors you want to crawl') +author_num = input() +# authorNum = 2 +print('Request received, start scraping') -startPage = Spider.Spider(url) -startPage.scrap() +book_dict = {} +author_dict = {} +similar_book_dict = {} +count = 1 + +print("Crawling book #" + str(count)) +start_page = Spider.Spider(url) +start_page.crawl(book_dict, author_dict, similar_book_dict) + + +while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): + all_related_books = list(similar_book_dict.items()) + for bookURLPair in all_related_books: + if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): + break + book_name = bookURLPair[0] + if 
book_name in book_dict: + pass + else: + count += 1 + print("Crawling book #" + str(count)) + bookURL = bookURLPair[1] + book_spider = Spider.Spider(bookURL) + book_spider.crawl(book_dict, author_dict, similar_book_dict) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. + time.sleep(0.2) + +db.insert_dicts(book_dict, 0) +db.insert_dicts(author_dict, 1) +print("Scraping completed") diff --git a/Crawler/Spider.py b/Crawler/Spider.py index 0b46824..a689789 100644 --- a/Crawler/Spider.py +++ b/Crawler/Spider.py @@ -1,5 +1,7 @@ import requests from bs4 import BeautifulSoup +import Crawler.BookPack as BookPack +import Crawler.AuthorPack as AuthorPack class Spider: @@ -8,26 +10,80 @@ class Spider: self.url = url self.soup = None - def scrap(self): - header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + def crawl(self, book_dict, author_dict, similar_book_dict): + header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} url = self.url res = requests.get(url, headers=header) - self.soup = BeautifulSoup(res.text, 'lxml') + self.soup = BeautifulSoup(res.text, "lxml") + titleAndAuthor = self.soup.title.string.split(" by ") + title = titleAndAuthor[0] + bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0] + rating_raw = self.soup.find("span", {"itemprop": "ratingValue"}).text + rating = float(str(rating_raw).replace(" ", "").replace("\n", "")) + rating_count = int(self.soup.find("meta", itemprop="ratingCount")["content"]) + review_count = int(self.soup.find("meta", itemprop="reviewCount")["content"]) + ISBN13_raw = self.soup.find("meta", property="books:isbn") + if ISBN13_raw["content"] == "null": + print("Warning: book '" + title + "' has no ISBN code") + ISBN13 = 0 + else: + ISBN13 = int(ISBN13_raw["content"]) + imageURL = self.soup.find("img", id="coverImage")["src"] + author = titleAndAuthor[1] + author_page_link = self.soup.find("a", {"class": "authorName"})["href"] + authorID = (author_page_link.split("show/")[1]).split(".")[0] + see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"] + res_similar = requests.get(see_more_link, headers=header) + similar_soup = BeautifulSoup(res_similar.text, 'lxml') + similar_books_and_authors = similar_soup.find_all("span", itemprop="name") + similar_books = [] + for i in range(len(similar_books_and_authors)): + if i % 2 == 0: + # a book + similar_book = similar_books_and_authors[i] + similar_book_name = similar_book.text + book_link = "https://www.goodreads.com" + similar_book.find_parent("a")["href"] + similar_books.append(similar_book_name) + similar_book_dict[similar_book_name] = book_link - seeMoreLink = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > ' - 'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > ' - 'div.bigBoxBody > div > a')[0].get("href") - print(seeMoreLink) + bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count, + review_count, imageURL, similar_books) + book_dict[title] = bookPKT + res_author = requests.get(author_page_link, headers=header) + author_soup = BeautifulSoup(res_author.text, 'lxml') + author_rating = float(author_soup.find("span", {"class": "average"}).text) + author_rating_count = float(author_soup.find("span", {"class": "value-title"})["title"]) + author_review_count = 
float((author_soup.find_all("span", {"class": "value-title"})[1])["title"]) + author_photo_url = author_soup.find("img", {"itemprop": "image"})["src"] + related_links = author_soup.find("div", {"class": "hreview-aggregate"}).findChildren("a", recursive=False) + related_authors_link = "https://www.goodreads.com" + related_links[1]["href"] + res_related_authors = requests.get(related_authors_link, headers=header) + related_author_soup = BeautifulSoup(res_related_authors.text, 'lxml') + related_authors = [] + related_authors_raw = related_author_soup.find_all("span", {"itemprop": "name"}) + for the_author in related_authors_raw: + related_author_name = the_author.text + if related_author_name == author: + pass + else: + related_authors.append(related_author_name) + author_books_link = "https://www.goodreads.com" + related_links[0]["href"] + res_author_books = requests.get(author_books_link, headers=header) + author_books_soup = BeautifulSoup(res_author_books.text, "lxml") + author_books_raw = author_books_soup.find_all("a", {"class": "bookTitle"}) + author_books = [] + for book in author_books_raw: + book_name = book.findChildren("span", recursive=False) + if not book_name: + pass + else: + book_name = book_name[0].text + author_books.append(book_name) + author_pkt = AuthorPack.AuthorPackage(author, author_page_link, authorID, author_rating, author_rating_count, + author_review_count, author_photo_url, related_authors, author_books) + author_dict[author] = author_pkt -# print(soup) -# print(soup.title.string) -# print(soup.find_all('a')) -# aTypes = soup.find_all('a')[0] -# print(aTypes) -# for obj in aTypes: -# if obj. -# print(soup.) diff --git a/DataBase/JsonParser.py b/DataBase/JsonParser.py index e69de29..2a5e524 100644 --- a/DataBase/JsonParser.py +++ b/DataBase/JsonParser.py @@ -0,0 +1,53 @@ + +def parse_book_dict_to_json(dictionary): + item_list = list(dictionary.items()) + return_list = [] + for items in item_list: + book_package = items[1] + book_json = parse_book_to_json(book_package) + return_list.append(book_json) + return return_list + + +def parse_author_dict_to_json(dictionary): + item_list = list(dictionary.items()) + return_list = [] + for items in item_list: + author_package = items[1] + author_json = parse_author_to_json(author_package) + return_list.append(author_json) + return return_list + + +def parse_book_to_json(bookPKT): + book_json = { + "type": "book", + "book_url": bookPKT.url, + "title": bookPKT.title, + "id": bookPKT.id, + "ISBN": bookPKT.ISBN13, + "author_url": bookPKT.author_url, + "author": bookPKT.author, + "rating": bookPKT.rating, + "rating_count": bookPKT.rating_count, + "review_count": bookPKT.review_count, + "image_url": bookPKT.image_url, + "similar_books": bookPKT.similar_books + } + return book_json + + +def parse_author_to_json(authorPKT): + author_json = { + "type": "author", + "name": authorPKT.name, + "author_url": authorPKT.url, + "id": authorPKT.id, + "rating": authorPKT.rating, + "rating_count": authorPKT.rating_count, + "review_count": authorPKT.review_count, + "image_url": authorPKT.image_url, + "related_authors": authorPKT.related_authors, + "author_books": authorPKT.author_books + } + return author_json diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index e69de29..c4e2222 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -0,0 +1,120 @@ +import json +import DataBase.JsonParser as parser +import os +from dotenv import load_dotenv +from pymongo import MongoClient + + +def get_db(): + """ return the database of goodReads 
crawler """ + load_dotenv() + url = os.getenv('MONGODB_URL') + client = MongoClient(url) + return client.get_database("crawler_db") + + +def insert_dicts(dictionary, opt): + """ + Insert books or authors collection in database + :param dictionary: the dictionary to insert to collection + :param opt: =0 means books collection; =1 means authors collection + :return: no return value + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting collection") + return + json_list = [] + if opt == 0: + json_list = parser.parse_book_dict_to_json(dictionary) + elif opt == 1: + json_list = parser.parse_author_dict_to_json(dictionary) + records.insert_many(json_list) + + +def update_dicts(opt, identifier, content): + """ + Update documentations in a given collection + :param opt: =0 means books collection; =1 means authors collection + :param identifier: the identifier of the documentation we want to find + :param content: the content to update + :return: no return value + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + result = records.update_many( + identifier, + {"$set": content}, + ) + print("matched documentation: " + str(result.matched_count)) + print("modified documentation: " + str(result.modified_count)) + + +def get_documents_json(opt, identifier): + """ + find documentations specified by the identifier and output a json data + :param opt: =0 means books collection; =1 means authors collection + :param identifier: identifier of the documents we want, {} means locate the whole collection + :return: json file of selected documentations + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting collection") + return json.dumps({}) + data = records.find(identifier) + file = {} + if opt == 0: + typeName = "books" + else: + typeName = "authors" + file[typeName] = [] + for item in data: + item.pop("_id") + file[typeName].append(item) + return json.dumps(file) + + +def download_collection(opt, identifier, name): + """ + download books collection or authors collection + :param opt: =0 means books collection; =1 means authors collection + :param identifier: identifier of the documents we want to download; + empty({}) means selected all documents in given collection + :param name: file name of downloaded json + :return: JSON file of the collection + """ + json_file = get_documents_json(opt, identifier) + load_dotenv() + root = os.getenv('ROOT') + with open(root + name + ".json", "w") as output: + output.write(json_file) + + +def clean(opt, identifier): + """ + delete specific documents in given collection + :param opt: =0 means books collection; =1 means authors collection + :param identifier: identifier of the documents we want to delete; + empty({}) means selected all documents in given collection + :return: no return value + """ + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting collection") + return + records.delete_many(identifier) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index e69de29..27f4fe6 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -0,0 +1,78 @@ +import re + + +def check_if_address_valid(address): + """ + Take input address and scrape date + 
:param addr: + :return: + """ + x = re.search("^https://www.goodreads.com/book/show/[0-9]+", address) + if x: + return True + else: + return False + + +def parse_query_to_url(query): + elements = re.findall("[0-9A-Za-z_\"><]+", query) + count = len(elements) + if count == 3: + # can only be A.B:C or wrong + if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2]) + else: + print("Invalid query.") + return "" + elif count == 4: + # a pair and one of [NOT, >, <]. + if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2]) + elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2]) + elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query): + return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[2]) + else: + print("Invalid query.") + return "" + elif 6 <= count <= 8: + # AND or OR operator + if re.search(".*\sAND\s.*", query): + parts = query.split(" AND ") + return parse_query_to_url(parts[0]) + "%26AND%26" + parse_query_to_url(parts[1]) + elif re.search(".*\sOR\s.*", query): + parts = query.split(" OR ") + return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1]) + else: + print("Invalid query.") + return "" + + +def parse_query_to_json(pair): + elements = re.findall("[0-9A-Za-z\"._]", pair) + count = len(elements) + print(count) + if count != 3 and count != 4: + print("Failed to parse query: invalid args number") + return {"wrong": "True"} # will never be founded in database + elif count == 3: + # can be A.B: C or A.B: "C" + if re.search("\".*\"", elements[2]): + return {elements[0] + "." + elements[1]: elements[2]} + else: + return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}} + else: + # can be NOT, >, or < + if elements[2] == "NOT": + return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}} + elif elements[2] == ">": + return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}} + elif elements[2] == "<": + return {elements[0] + "." 
+ elements[1]: {"$lt": float(elements[3])}} + else: + print("Failed to parse query: unknown operator") + return {"wrong": "True"} + + +def url_safe(element): + return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C") \ No newline at end of file diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index e69de29..c416ace 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -0,0 +1,72 @@ +import RegularExpressionParser.Parser as parser +import DataBase.mongoDB as db +import re +from flask import Flask +from flask import request +from urllib.parse import urlparse, parse_qs + +app = Flask(__name__) +app.config["DEBUG"] = True + + +@app.route("/", methods=['GET']) +def home(): + return "200: successfully connected to home page\n" + + +@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) +def data(collection): + if request.method == "GET": + if collection == "books" or collection == "book": + identifier = request.args.to_dict() + print(identifier) + print(type(identifier)) + return "200: find" + elif collection == "authors" or collection == "author": + print(request.url) + return "200: find" + elif collection == "search": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + return search_document(qs_parsed["q"][0].split("&")) + else: + return "404 not found" + # elif request.method == "PUT": + + +def search_document(identifiers): + if len(identifiers) == 1: + json_idt = parser.parse_query_to_json(identifiers[0]) + if re.search("^book.*", identifiers[0]): + return db.get_documents_json(0, json_idt) + else: + return db.get_documents_json(1, json_idt) + elif len(identifiers) == 3: + if re.search("^book.*", identifiers[0]): + if re.search("^author.*", identifiers[2]): + print("Failed to find documentation: two statements are not pointing to the same collection") + return {} + else: + opt = 0 + else: + if re.search("^book.*",identifiers[2]): + print("Failed to find documentation: two statements are not pointing to the same collection") + return {} + else: + opt = 1 + json_idt1 = parser.parse_query_to_json(identifiers[0]) + json_idt2 = parser.parse_query_to_json(identifiers[2]) + if identifiers[1] == "AND": + exp = {"$and": [json_idt1, json_idt2]} + elif identifiers[1] == "OR": + exp = {"$or": [json_idt1,json_idt2]} + else: + print("Failed to parse query: unknown operator for identifiers[1]") + return {} + return db.get_documents_json(opt, exp) + else: + return "Error, unknown identifiers" + + +if __name__ == "__main__": + app.run() -- GitLab From a3565f17c3de72df48b40d29615e6ed5095d7fe9 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 00:53:56 +0800 Subject: [PATCH 7/9] assignment-2.1 version3: API all deployed, CLI initially finished --- API/Functions.py | 38 +++++- API/__init__.py | 0 Client/CLI.py | 183 ++++++++++++++++++++++++++-- Client/__init__.py | 0 Crawler/Main.py | 48 ++++++-- Crawler/__init__.py | 0 DataBase/__init__.py | 0 DataBase/mongoDB.py | 22 +++- RegularExpressionParser/Parser.py | 42 ++++--- RegularExpressionParser/__init__.py | 0 Server/SimpleServer.py | 91 ++++++++++---- Server/__init__.py | 0 12 files changed, 360 insertions(+), 64 deletions(-) create mode 100644 API/__init__.py create mode 100644 Client/__init__.py create mode 100644 Crawler/__init__.py create mode 100644 DataBase/__init__.py create mode 100644 RegularExpressionParser/__init__.py create mode 100644 Server/__init__.py diff --git a/API/Functions.py b/API/Functions.py index 96e4881..452887c 
100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -1,21 +1,49 @@ import requests import os -import RegularExpressionParser.Parser as parser -import re +import RegularExpressionParser.Parser as Parser from dotenv import load_dotenv +def safe_url(query): + load_dotenv() + host = os.getenv('SERVER_HOST') + return host + Parser.url_safe(query) + + def get(query): """ function used to send get request to find one or several documentations :param query: user input query :return: json file of the given author or book """ - load_dotenv() - host = os.getenv('SERVER_HOST') - url = host + parser.url_safe(query) + print("GET: " + query) + url = safe_url(query) req = requests.get(url) + print("response code: " + str(req.status_code)) return req.json() +def put(query, json_file): + print("PUT: " + query) + url = safe_url(query) + req = requests.put(url, json=json_file) + print("response code: " + str(req.status_code)) + return req.text + + +def post(query, json_file): + print("POST: " + query) + url = safe_url(query) + req = requests.put(url, json=json_file) + print("response code: " + str(req.status_code)) + return req.text + + +def delete(query): + print("DELETE: " + query) + url = safe_url(query) + req = requests.delete(url) + print("response code: " + str(req.status_code)) + return req.text + diff --git a/API/__init__.py b/API/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Client/CLI.py b/Client/CLI.py index b3c40d6..8104183 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -1,21 +1,186 @@ import typer +import RegularExpressionParser.Parser as Parser +import Crawler.Main as Main +import API.Functions as API +import os +import json +from dotenv import load_dotenv app = typer.Typer() + @app.command() -def scrape_book(url: str): - """ Function used to create scrape request to Server """ +def main_page(): + print("Welcome to use goodReads scraper!\n" + "This is main page\n" + "Usage:\n" + "scrape: scrape goodRead and load data to database\n" + "get: find a document for a book or an author from database\n" + "put: create or update a document in database\n" + "post: add one or multiple documents of book or author in database \n" + "delete: delete certain document from database\n" + "export: export a certain collection from database\n" + "exit: end the program") + order = input() + if order == "scrape": + scrape() + elif order == "get": + get() + elif order == "put": + put() + elif order == "post": + post() + elif order == "delete": + delete() + elif order == "export": + export() + elif order == "exit": + exit() + else: + print("Unknown command") + main_page() +def scrape(): + """ + Function used to create scrape request to Server + Only for assignment 2.0, for GET scrape, find it in GET in SimplerServer.py + """ + print('please enter URL of the book') + url = input() + print('please enter the number of books you want to crawl') + book_num = input() + if not book_num.isdigit(): + print("max book num must be an integer") + main_page() + book_num = int(book_num) + print('please enter the number of authors you want to crawl') + author_num = input() + if not author_num.isdigit(): + print("max author num must be an integer") + main_page() + author_num = int(author_num) + print('Request received, start scraping') + if not Parser.check_if_address_valid(url): + print("Error: invalid URL") + main_page() + elif book_num >= 2000 or author_num >= 2000: + print("Error, the number of book and author should be lower than 2000") + main_page() + else: + if book_num > 200 or author_num > 50: + 
print("Warning, maximum of the number of books or authors is large") + Main.scrape_api(url, book_num, author_num) + main_page() -@app.command() -def goodbye(name: str, formal: bool = False): - """ function2 """ - if formal: - typer.echo(f"Goodbye Ms. {name}. Have a good day.") + +def get(): + print("Please enter your query:") + query = input() + result = API.get(query) + typer.echo(result) + main_page() + + +def put(): + print("Please enter your query:") + query = input() + print("Please select one way to load update info: input or read:") + load = input() + if load == "input": + print("Please enter json data:") + string = input() + try: + data = json.loads(string) + except ValueError as err: + print(err) + main_page() + elif load == "read": + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:") + file_path = r"" + file_root + input() + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + main_page() + else: + print("Error: file not exist") + main_page() else: - typer.echo(f"Bye {name}!") + print("Unknown command") + main_page() + result = API.put(query, data) + typer.echo(result) + main_page() + + +def post(): + print("Please enter your query:") + query = input() + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:") + file_path = r"" + file_root + input() + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + main_page() + else: + print("Error: file not exist") + main_page() + result = API.post(query, data) + typer.echo(result) + main_page() + + +def delete(): + print("Please enter your query:") + query = input() + result = API.delete(query) + typer.echo(result) + main_page() + + +def export(): + print("Which collection do you want to export? books or authors?") + collection = input() + if collection == "books": + json = API.get("book") + load_dotenv() + file_root = os.getenv('FILE_ROOT') + with open(file_root + "books" + ".json", "w") as output: + output.write(json) + print("books.json is stored at " + file_root) + main_page() + elif collection == "authors": + json = API.get("author") + load_dotenv() + file_root = os.getenv('FILE_ROOT') + with open(file_root + "authors" + ".json", "w") as output: + output.write(json) + print("authors.json is stored at " + file_root) + main_page() + elif collection == "back": + main_page() + else: + print("Unknown collection") + main_page() + + +# def goodbye(name: str, formal: bool = False): +# """ function2 """ +# if formal: +# typer.echo(f"Goodbye Ms. {name}. 
Have a good day.") +# else: +# typer.echo(f"Bye {name}!") if __name__ == "__main__": - app() \ No newline at end of file + app() diff --git a/Client/__init__.py b/Client/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Crawler/Main.py b/Crawler/Main.py index 24a7e96..c778b1c 100644 --- a/Crawler/Main.py +++ b/Crawler/Main.py @@ -1,18 +1,50 @@ import Crawler.Spider as Spider import time +import RegularExpressionParser.Parser as Parser from DataBase import mongoDB as db +def scrape_api(url_api, book_num_api, author_num_api): + print('Request received, start scraping') + book_dict_api = {} + author_dict_api = {} + similar_book_dict_api = {} + count_api = 1 + + print("Crawling book #" + str(count_api)) + start_page_api = Spider.Spider(url_api) + start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api): + all_related_books_api = list(similar_book_dict_api.items()) + for book_url_pair_api in all_related_books_api: + if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api): + break + book_name_api = book_url_pair_api[0] + if book_name_api in book_dict_api: + pass + else: + count_api += 1 + print("Crawling book #" + str(count_api)) + book_url_api = book_url_pair_api[1] + book_spider_api = Spider.Spider(book_url_api) + book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. + time.sleep(0.2) + + db.insert_dicts(book_dict_api, 0) + db.insert_dicts(author_dict_api, 1) + print("Scraping completed") + + print('please enter URL of the book') url = input() -# url = https://www.goodreads.com/book/show/3735293-clean-code -# https://www.goodreads.com/book/show/35133922-educated print('please enter the number of books you want to crawl') book_num = input() -# bookNum = 3 print('please enter the number of authors you want to crawl') author_num = input() -# authorNum = 2 print('Request received, start scraping') book_dict = {} @@ -27,17 +59,17 @@ start_page.crawl(book_dict, author_dict, similar_book_dict) while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): all_related_books = list(similar_book_dict.items()) - for bookURLPair in all_related_books: + for book_url_pair in all_related_books: if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): break - book_name = bookURLPair[0] + book_name = book_url_pair[0] if book_name in book_dict: pass else: count += 1 print("Crawling book #" + str(count)) - bookURL = bookURLPair[1] - book_spider = Spider.Spider(bookURL) + book_url = book_url_pair[1] + book_spider = Spider.Spider(book_url) book_spider.crawl(book_dict, author_dict, similar_book_dict) # since the network delay from asia to goodread.com is already very high, diff --git a/Crawler/__init__.py b/Crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/DataBase/__init__.py b/DataBase/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index c4e2222..4095fa3 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -13,6 +13,18 @@ def get_db(): return client.get_database("crawler_db") +def insert_document(docu, opt): + db = get_db() + if opt == 0: + records = db.books + elif opt == 1: + records = db.authors + else: + print("failed to get json file: wrong opt for selecting 
collection") + return + records.insert_one(docu) + + def insert_dicts(dictionary, opt): """ Insert books or authors collection in database @@ -49,9 +61,13 @@ def update_dicts(opt, identifier, content): records = db.books elif opt == 1: records = db.authors - result = records.update_many( + else: + print("failed to get json file: wrong opt for selecting collection") + return + result = records.update_one( identifier, {"$set": content}, + upsert=True ) print("matched documentation: " + str(result.matched_count)) print("modified documentation: " + str(result.modified_count)) @@ -96,8 +112,8 @@ def download_collection(opt, identifier, name): """ json_file = get_documents_json(opt, identifier) load_dotenv() - root = os.getenv('ROOT') - with open(root + name + ".json", "w") as output: + file_root = os.getenv('FILE_ROOT') + with open(file_root + name + ".json", "w") as output: output.write(json_file) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index 27f4fe6..fc6f628 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -49,29 +49,35 @@ def parse_query_to_url(query): def parse_query_to_json(pair): - elements = re.findall("[0-9A-Za-z\"._]", pair) + print(pair) + elements = re.findall("[0-9A-Za-z\"_.]+", pair) count = len(elements) - print(count) - if count != 3 and count != 4: + if count != 2: print("Failed to parse query: invalid args number") return {"wrong": "True"} # will never be founded in database - elif count == 3: - # can be A.B: C or A.B: "C" - if re.search("\".*\"", elements[2]): - return {elements[0] + "." + elements[1]: elements[2]} - else: - return {elements[0] + "." + elements[1]: {"$regex": elements[2], "$options": "i"}} else: - # can be NOT, >, or < - if elements[2] == "NOT": - return {elements[0] + "." + elements[1]: {"$not": {"$regex": elements[3], "$options": "i"}}} - elif elements[2] == ">": - return {elements[0] + "." + elements[1]: {"$gt": float(elements[3])}} - elif elements[2] == "<": - return {elements[0] + "." 
+ elements[1]: {"$lt": float(elements[3])}} + # can be A.B: C or A.B: "C" + if re.search(":", pair): + if re.search("^[0-9.]*$", elements[1]): + return {elements[0].split(".")[1]: float(elements[1])} + else: + if re.search("\".*\"", elements[1]): + return {elements[0].split(".")[1]: elements[1]} + else: + return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: - print("Failed to parse query: unknown operator") - return {"wrong": "True"} + if re.search("NOT", pair): + if re.search("^[0-9.]*$", elements[1]): + return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} + else: + return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} + elif re.search(">", pair): + return {elements[0].split(".")[1]: {"$gt": float(elements[1])}} + elif re.search("<", pair): + return {elements[0].split(".")[1]: {"$lt": float(elements[1])}} + else: + print("Failed to parse query: unknown operator") + return {"wrong": "True"} def url_safe(element): diff --git a/RegularExpressionParser/__init__.py b/RegularExpressionParser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index c416ace..ae10ffb 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -1,12 +1,12 @@ -import RegularExpressionParser.Parser as parser -import DataBase.mongoDB as db +import DataBase.mongoDB as DataBase +import RegularExpressionParser.Parser as Parser import re from flask import Flask from flask import request from urllib.parse import urlparse, parse_qs app = Flask(__name__) -app.config["DEBUG"] = True +# app.config["DEBUG"] = True @app.route("/", methods=['GET']) @@ -15,32 +15,79 @@ def home(): @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) -def data(collection): +def data_base(collection): if request.method == "GET": - if collection == "books" or collection == "book": - identifier = request.args.to_dict() - print(identifier) - print(type(identifier)) - return "200: find" - elif collection == "authors" or collection == "author": - print(request.url) - return "200: find" + if collection == "book": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + # print(qs_parsed["id"]) + if qs_parsed == {}: + return DataBase.get_documents_json(0, {}) + return search_document(["book.id:" + qs_parsed["id"][0]]) + elif collection == "author": + url_parsed = urlparse(request.url) + qs_parsed = parse_qs(url_parsed.query) + # print(type(qs_parsed["id"])) + if qs_parsed == {}: + return DataBase.get_documents_json(1, {}) + return search_document(["author.id:" + qs_parsed["id"][0]]) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) + print(qs_parsed) return search_document(qs_parsed["q"][0].split("&")) else: - return "404 not found" - # elif request.method == "PUT": + return "404: Unknown Collection to GET" + elif request.method == "PUT": + if request.headers["Content-Type"] != "application/json": + return "415: content should be JSON file" + json_update_info = request.json + if collection == "book": + opt = 0 + elif collection == "author": + opt = 1 + else: + return "404: Unknown Collection to GET" + DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) + return "200: PUT succeeded" + elif request.method == "POST": + if request.headers["Content-Type"] != "application/json": + return "415: content should be JSON file" + json_file = request.json + if collection == "books": + 
DataBase.insert_dicts(json_file, 0) + elif collection == "authors": + DataBase.insert_dicts(json_file, 1) + elif collection == "book": + DataBase.insert_document(json_file, 0) + elif collection == "author": + DataBase.insert_document(json_file, 1) + elif collection == "scrape": + return "200" + else: + return "404: Unknown Collection to POST" + return "201: POST succeeded" + elif request.method == "DELETE": + identifier = request.args.to_dict() + print(identifier) + if collection == "book": + opt = 0 + elif collection == "author": + opt = 1 + else: + return "404: Unknown Collection to DELETE" + DataBase.clean(opt, identifier) + return "200: DELETE succeeded" def search_document(identifiers): if len(identifiers) == 1: - json_idt = parser.parse_query_to_json(identifiers[0]) + json_idt = Parser.parse_query_to_json(identifiers[0]) + print(json_idt) if re.search("^book.*", identifiers[0]): - return db.get_documents_json(0, json_idt) + return DataBase.get_documents_json(0, json_idt) else: - return db.get_documents_json(1, json_idt) + return DataBase.get_documents_json(1, json_idt) elif len(identifiers) == 3: if re.search("^book.*", identifiers[0]): if re.search("^author.*", identifiers[2]): @@ -54,16 +101,18 @@ def search_document(identifiers): return {} else: opt = 1 - json_idt1 = parser.parse_query_to_json(identifiers[0]) - json_idt2 = parser.parse_query_to_json(identifiers[2]) + json_idt1 = Parser.parse_query_to_json(identifiers[0]) + json_idt2 = Parser.parse_query_to_json(identifiers[2]) if identifiers[1] == "AND": exp = {"$and": [json_idt1, json_idt2]} elif identifiers[1] == "OR": - exp = {"$or": [json_idt1,json_idt2]} + exp = {"$or": [json_idt1, json_idt2]} else: print("Failed to parse query: unknown operator for identifiers[1]") return {} - return db.get_documents_json(opt, exp) + print("exp:") + print(exp) + return DataBase.get_documents_json(opt, exp) else: return "Error, unknown identifiers" diff --git a/Server/__init__.py b/Server/__init__.py new file mode 100644 index 0000000..e69de29 -- GitLab From 155ae6b1e1c409b2eb4291fef1a240ca8ad54924 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 05:49:02 +0800 Subject: [PATCH 8/9] assignment2.1 version3: part3 finished, start testing --- API/Functions.py | 10 +- Client/CLI.py | 197 ++++++++++++++++++------------ Crawler/Main.py | 81 ------------ Crawler/Scrape.py | 38 ++++++ Crawler/Spider.py | 13 ++ RegularExpressionParser/Parser.py | 4 +- Server/SimpleServer.py | 19 ++- 7 files changed, 189 insertions(+), 173 deletions(-) delete mode 100644 Crawler/Main.py create mode 100644 Crawler/Scrape.py diff --git a/API/Functions.py b/API/Functions.py index 452887c..4858e26 100644 --- a/API/Functions.py +++ b/API/Functions.py @@ -19,7 +19,7 @@ def get(query): print("GET: " + query) url = safe_url(query) req = requests.get(url) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.json() @@ -27,15 +27,15 @@ def put(query, json_file): print("PUT: " + query) url = safe_url(query) req = requests.put(url, json=json_file) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text def post(query, json_file): print("POST: " + query) url = safe_url(query) - req = requests.put(url, json=json_file) - print("response code: " + str(req.status_code)) + req = requests.post(url, json=json_file) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text @@ -43,7 +43,7 @@ def 
delete(query): print("DELETE: " + query) url = safe_url(query) req = requests.delete(url) - print("response code: " + str(req.status_code)) + print("\nresponse code: " + str(req.status_code) + "\n") return req.text diff --git a/Client/CLI.py b/Client/CLI.py index 8104183..3bb87ef 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -1,9 +1,10 @@ import typer import RegularExpressionParser.Parser as Parser -import Crawler.Main as Main +import Crawler.Scrape as Scrape import API.Functions as API import os import json +import re from dotenv import load_dotenv app = typer.Typer() @@ -11,20 +12,24 @@ app = typer.Typer() @app.command() def main_page(): - print("Welcome to use goodReads scraper!\n" - "This is main page\n" - "Usage:\n" - "scrape: scrape goodRead and load data to database\n" - "get: find a document for a book or an author from database\n" - "put: create or update a document in database\n" - "post: add one or multiple documents of book or author in database \n" - "delete: delete certain document from database\n" - "export: export a certain collection from database\n" - "exit: end the program") + """ + main page of client, directly run this .py to enter and see usage + :return: none + """ + print("==================================================================\n\n" + "Welcome to use goodReads scraper!\n\n" + "This is main page\n\n" + "You can input back to return to this page at anytime\n\n" + "Usage:\n\n" + "get: find a document for a book or an author from database\n\n" + "put: create or update a document in database\n\n" + "post: add one or multiple documents of book or author in database \n\n" + "delete: delete certain document from database\n\n" + "export: export a certain collection from database\n\n" + "exit: end the program\n\n" + "Please enter your command:\n") order = input() - if order == "scrape": - scrape() - elif order == "get": + if order == "get": get() elif order == "put": put() @@ -34,152 +39,186 @@ def main_page(): delete() elif order == "export": export() - elif order == "exit": + if order == "exit": exit() else: print("Unknown command") main_page() -def scrape(): +def check_and_back(command): + """ check if used in put back and return to main page if so """ + if command == "back": + main_page() + + +def back_to_main_page(): + """ wait for user to press enter and then return to the main page """ + print("press enter to return to main page") + no_use = input() + main_page() + + +def scrape(query): """ Function used to create scrape request to Server - Only for assignment 2.0, for GET scrape, find it in GET in SimplerServer.py """ - print('please enter URL of the book') - url = input() - print('please enter the number of books you want to crawl') - book_num = input() + attributes = query.split("?")[1].split("&") + print(attributes) + url = attributes[0].split("=", 1)[1] + book_num = attributes[1].split("=")[1] if not book_num.isdigit(): print("max book num must be an integer") - main_page() + back_to_main_page() book_num = int(book_num) - print('please enter the number of authors you want to crawl') - author_num = input() + author_num = attributes[2].split("=")[1] if not author_num.isdigit(): print("max author num must be an integer") - main_page() + back_to_main_page() author_num = int(author_num) print('Request received, start scraping') if not Parser.check_if_address_valid(url): print("Error: invalid URL") - main_page() - elif book_num >= 2000 or author_num >= 2000: - print("Error, the number of book and author should be lower than 2000") - main_page() + 
back_to_main_page() + elif book_num >= 2000 or author_num >= 500 or book_num <= 0 or author_num <= 0: + print("Error, the number of book and author should be positive and lower than 2000/500") + back_to_main_page() else: if book_num > 200 or author_num > 50: print("Warning, maximum of the number of books or authors is large") - Main.scrape_api(url, book_num, author_num) - main_page() + result = API.post(query, {}) + return result def get(): - print("Please enter your query:") + """ GET API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() + check_and_back(query) result = API.get(query) typer.echo(result) - main_page() + print() + back_to_main_page() def put(): - print("Please enter your query:") + """ PUT API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() - print("Please select one way to load update info: input or read:") + check_and_back(query) + print("Please select one way to load update info: input or read:\n") load = input() + check_and_back(load) if load == "input": - print("Please enter json data:") + print("Please enter json data:\n") string = input() + check_and_back(string) try: data = json.loads(string) except ValueError as err: print(err) - main_page() + back_to_main_page() elif load == "read": load_dotenv() file_root = os.getenv('FILE_ROOT') - print("Please enter the name of file which is in the file root directory:") - file_path = r"" + file_root + input() + print("Please enter the name of file which is in the file root directory:\n") + name = input() + check_and_back(name) + file_path = r"" + file_root + name if os.path.isfile(file_path): with open(file_path, "r") as file: try: data = json.load(file) except ValueError as err: print(err) - main_page() + back_to_main_page() else: print("Error: file not exist") - main_page() + back_to_main_page() else: print("Unknown command") - main_page() + back_to_main_page() result = API.put(query, data) typer.echo(result) - main_page() + print() + back_to_main_page() def post(): - print("Please enter your query:") + """ POST API """ + print("==================================================================\n") + print("Please enter your query:\n") query = input() - load_dotenv() - file_root = os.getenv('FILE_ROOT') - print("Please enter the name of file which is in the file root directory:") - file_path = r"" + file_root + input() - if os.path.isfile(file_path): - with open(file_path, "r") as file: - try: - data = json.load(file) - except ValueError as err: - print(err) - main_page() + check_and_back(query) + if re.search("scrape", query): + result = scrape(query) + typer.echo(result) + print() + back_to_main_page() else: - print("Error: file not exist") - main_page() - result = API.post(query, data) - typer.echo(result) - main_page() + load_dotenv() + file_root = os.getenv('FILE_ROOT') + print("Please enter the name of file which is in the file root directory:\n") + name = input() + check_and_back(name) + file_path = r"" + file_root + name + if os.path.isfile(file_path): + with open(file_path, "r") as file: + try: + data = json.load(file) + except ValueError as err: + print(err) + back_to_main_page() + else: + print("Error: file not exist") + back_to_main_page() + result = API.post(query, data) + typer.echo(result) + print() + back_to_main_page() def delete(): - print("Please enter your query:") + """ DELETE API """ + 
print("==================================================================\n") + print("Please enter your query:\n") query = input() + check_and_back(query) result = API.delete(query) typer.echo(result) - main_page() + print() + back_to_main_page() def export(): - print("Which collection do you want to export? books or authors?") + """ export the book or author collection """ + print("==================================================================\n") + print("Which collection do you want to export: books or authors?\n") collection = input() + check_and_back(collection) if collection == "books": - json = API.get("book") + json_file = API.get("book") load_dotenv() file_root = os.getenv('FILE_ROOT') with open(file_root + "books" + ".json", "w") as output: - output.write(json) + output.write(json_file) print("books.json is stored at " + file_root) - main_page() + back_to_main_page() elif collection == "authors": - json = API.get("author") + json_file = API.get("author") load_dotenv() file_root = os.getenv('FILE_ROOT') with open(file_root + "authors" + ".json", "w") as output: - output.write(json) + output.write(json_file) print("authors.json is stored at " + file_root) - main_page() + back_to_main_page() elif collection == "back": - main_page() + back_to_main_page() else: print("Unknown collection") - main_page() - - -# def goodbye(name: str, formal: bool = False): -# """ function2 """ -# if formal: -# typer.echo(f"Goodbye Ms. {name}. Have a good day.") -# else: -# typer.echo(f"Bye {name}!") + back_to_main_page() if __name__ == "__main__": diff --git a/Crawler/Main.py b/Crawler/Main.py deleted file mode 100644 index c778b1c..0000000 --- a/Crawler/Main.py +++ /dev/null @@ -1,81 +0,0 @@ -import Crawler.Spider as Spider -import time -import RegularExpressionParser.Parser as Parser -from DataBase import mongoDB as db - - -def scrape_api(url_api, book_num_api, author_num_api): - print('Request received, start scraping') - book_dict_api = {} - author_dict_api = {} - similar_book_dict_api = {} - count_api = 1 - - print("Crawling book #" + str(count_api)) - start_page_api = Spider.Spider(url_api) - start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) - - while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api): - all_related_books_api = list(similar_book_dict_api.items()) - for book_url_pair_api in all_related_books_api: - if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api): - break - book_name_api = book_url_pair_api[0] - if book_name_api in book_dict_api: - pass - else: - count_api += 1 - print("Crawling book #" + str(count_api)) - book_url_api = book_url_pair_api[1] - book_spider_api = Spider.Spider(book_url_api) - book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) - - # since the network delay from asia to goodread.com is already very high, - # a small amount of time of sleep is used here. 
- time.sleep(0.2) - - db.insert_dicts(book_dict_api, 0) - db.insert_dicts(author_dict_api, 1) - print("Scraping completed") - - -print('please enter URL of the book') -url = input() -print('please enter the number of books you want to crawl') -book_num = input() -print('please enter the number of authors you want to crawl') -author_num = input() -print('Request received, start scraping') - -book_dict = {} -author_dict = {} -similar_book_dict = {} -count = 1 - -print("Crawling book #" + str(count)) -start_page = Spider.Spider(url) -start_page.crawl(book_dict, author_dict, similar_book_dict) - - -while len(book_dict) < int(book_num) or len(author_dict) < int(author_num): - all_related_books = list(similar_book_dict.items()) - for book_url_pair in all_related_books: - if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num): - break - book_name = book_url_pair[0] - if book_name in book_dict: - pass - else: - count += 1 - print("Crawling book #" + str(count)) - book_url = book_url_pair[1] - book_spider = Spider.Spider(book_url) - book_spider.crawl(book_dict, author_dict, similar_book_dict) - - # since the network delay from asia to goodread.com is already very high, - # a small amount of time of sleep is used here. - time.sleep(0.2) - -db.insert_dicts(book_dict, 0) -db.insert_dicts(author_dict, 1) -print("Scraping completed") diff --git a/Crawler/Scrape.py b/Crawler/Scrape.py new file mode 100644 index 0000000..a2f97ae --- /dev/null +++ b/Crawler/Scrape.py @@ -0,0 +1,38 @@ +import Crawler.Spider as Spider +import time +from DataBase import mongoDB as db + + +def scrape_api(url_api, book_num_api, author_num_api): + print('Request received, start scraping') + book_dict_api = {} + author_dict_api = {} + similar_book_dict_api = {} + count_api = 1 + + print("Crawling book #" + str(count_api)) + start_page_api = Spider.Spider(url_api) + start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + while len(book_dict_api) < int(book_num_api) or len(author_dict_api) < int(author_num_api): + all_related_books_api = list(similar_book_dict_api.items()) + for book_url_pair_api in all_related_books_api: + if len(book_dict_api) >= int(book_num_api) and len(author_dict_api) >= int(author_num_api): + break + book_name_api = book_url_pair_api[0] + if book_name_api in book_dict_api: + pass + else: + count_api += 1 + print("Crawling book #" + str(count_api)) + book_url_api = book_url_pair_api[1] + book_spider_api = Spider.Spider(book_url_api) + book_spider_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api) + + # since the network delay from asia to goodread.com is already very high, + # a small amount of time of sleep is used here. 
+ time.sleep(0.2) + + db.insert_dicts(book_dict_api, 0) + db.insert_dicts(author_dict_api, 1) + print("Scraping completed") \ No newline at end of file diff --git a/Crawler/Spider.py b/Crawler/Spider.py index a689789..bab2576 100644 --- a/Crawler/Spider.py +++ b/Crawler/Spider.py @@ -11,12 +11,21 @@ class Spider: self.soup = None def crawl(self, book_dict, author_dict, similar_book_dict): + """ + function used to scrape data of a certain book + :param book_dict: + :param author_dict: + :param similar_book_dict: + :return: none + """ header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"} url = self.url res = requests.get(url, headers=header) self.soup = BeautifulSoup(res.text, "lxml") + + # parsing data of current book titleAndAuthor = self.soup.title.string.split(" by ") title = titleAndAuthor[0] bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0] @@ -36,6 +45,8 @@ class Spider: authorID = (author_page_link.split("show/")[1]).split(".")[0] see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"] res_similar = requests.get(see_more_link, headers=header) + + # parsing similar books of current book similar_soup = BeautifulSoup(res_similar.text, 'lxml') similar_books_and_authors = similar_soup.find_all("span", itemprop="name") similar_books = [] @@ -51,6 +62,8 @@ class Spider: bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count, review_count, imageURL, similar_books) book_dict[title] = bookPKT + + # parse data of the author of current book res_author = requests.get(author_page_link, headers=header) author_soup = BeautifulSoup(res_author.text, 'lxml') author_rating = float(author_soup.find("span", {"class": "average"}).text) diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index fc6f628..6743910 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -58,7 +58,7 @@ def parse_query_to_json(pair): else: # can be A.B: C or A.B: "C" if re.search(":", pair): - if re.search("^[0-9.]*$", elements[1]): + if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): return {elements[0].split(".")[1]: float(elements[1])} else: if re.search("\".*\"", elements[1]): @@ -67,7 +67,7 @@ def parse_query_to_json(pair): return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: if re.search("NOT", pair): - if re.search("^[0-9.]*$", elements[1]): + if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} else: return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index ae10ffb..8f34109 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -1,6 +1,7 @@ import DataBase.mongoDB as DataBase import RegularExpressionParser.Parser as Parser import re +import Crawler.Scrape as Scrape from flask import Flask from flask import request from urllib.parse import urlparse, parse_qs @@ -20,21 +21,18 @@ def data_base(collection): if collection == "book": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - # print(qs_parsed["id"]) if qs_parsed == {}: return DataBase.get_documents_json(0, {}) return search_document(["book.id:" + qs_parsed["id"][0]]) elif collection == "author": url_parsed = urlparse(request.url) qs_parsed = 
parse_qs(url_parsed.query) - # print(type(qs_parsed["id"])) if qs_parsed == {}: return DataBase.get_documents_json(1, {}) return search_document(["author.id:" + qs_parsed["id"][0]]) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - print(qs_parsed) return search_document(qs_parsed["q"][0].split("&")) else: return "404: Unknown Collection to GET" @@ -47,7 +45,7 @@ def data_base(collection): elif collection == "author": opt = 1 else: - return "404: Unknown Collection to GET" + return "404: Unknown Collection to PUT" DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) return "200: PUT succeeded" elif request.method == "POST": @@ -63,7 +61,15 @@ def data_base(collection): elif collection == "author": DataBase.insert_document(json_file, 1) elif collection == "scrape": - return "200" + # url_parsed = urlparse(request.url) + # qs_parsed = parse_qs(url_parsed.query) + # print(qs_parsed) + param = request.args.to_dict() + url = param["url"] + max_book = param["max_book"] + max_author = param["max_author"] + Scrape.scrape_api(url, max_book, max_author) + return "201: new data has been added to database" else: return "404: Unknown Collection to POST" return "201: POST succeeded" @@ -81,6 +87,7 @@ def data_base(collection): def search_document(identifiers): + """ function used to find one or several documents in the database """ if len(identifiers) == 1: json_idt = Parser.parse_query_to_json(identifiers[0]) print(json_idt) @@ -96,7 +103,7 @@ def search_document(identifiers): else: opt = 0 else: - if re.search("^book.*",identifiers[2]): + if re.search("^book.*", identifiers[2]): print("Failed to find document: two statements are not pointing to the same collection") return {} else: -- GitLab From 0b8727418b7a440321684b3c8d28ef9957d547d7 Mon Sep 17 00:00:00 2001 From: HenryShan <zshan2@illinois.edu> Date: Tue, 9 Mar 2021 13:57:01 +0800 Subject: [PATCH 9/9] assignment2.1 final version --- Client/CLI.py | 8 ++++ DataBase/mongoDB.py | 20 +++++----- README.md | 15 +++++++- RegularExpressionParser/Parser.py | 48 +++++++++++------------ Server/SimpleServer.py | 37 ++++++++++-------- Tests/DataBaseTests.py | 53 +++++++++++++++++++++++++ Tests/ParserTests.py | 36 +++++++++++++++++ Tests/ServerTests.py | 64 +++++++++++++++++++++++++++++++ Tests/testData.json | 34 ++++++++++++++++ 9 files changed, 264 insertions(+), 51 deletions(-) create mode 100644 Tests/DataBaseTests.py create mode 100644 Tests/ParserTests.py create mode 100644 Tests/ServerTests.py create mode 100644 Tests/testData.json diff --git a/Client/CLI.py b/Client/CLI.py index 3bb87ef..071ffac 100644 --- a/Client/CLI.py +++ b/Client/CLI.py @@ -140,6 +140,14 @@ def put(): else: print("Unknown command") back_to_main_page() + if re.search("books", query): + if len(data) <= 1: + print("For posting multiple books, please input a json file that contains more than one book\n") + main_page() + else: + if len(data) > 1: + print("Cannot post more than one book with 'post book/author', please use 'post books/authors'") + main_page() result = API.put(query, data) typer.echo(result) print() diff --git a/DataBase/mongoDB.py b/DataBase/mongoDB.py index 4095fa3..ef437fc 100644 --- a/DataBase/mongoDB.py +++ b/DataBase/mongoDB.py @@ -16,9 +16,9 @@ def get_db(): def insert_document(docu, opt): db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return
@@ -34,9 +34,9 @@ def insert_dicts(dictionary, opt): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -58,9 +58,9 @@ def update_dicts(opt, identifier, content): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return @@ -82,9 +82,9 @@ def get_documents_json(opt, identifier): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return json.dumps({}) @@ -127,9 +127,9 @@ def clean(opt, identifier): """ db = get_db() if opt == 0: - records = db.books + records = db.test_books elif opt == 1: - records = db.authors + records = db.test_authors else: print("failed to get json file: wrong opt for selecting collection") return diff --git a/README.md b/README.md index 4d0dd40..52026e1 100644 --- a/README.md +++ b/README.md @@ -1 +1,14 @@ -## SP21-CS242 Assignment2 \ No newline at end of file +## SP21-CS242 Assignment2: GoodReads Scraper +This project is for the SP21 CS242 Assignment 2. + +Current version is 2.1. + +### Functions + 1. Scrape data of books and authors from GoodReads + 2. Store scraped data in the cloud database or in local files + 3. Query the cloud database for specific book or author documents + +### Composition +#### Client: command line interface (interactive) +#### Server: simple server based on Flask +#### Database: MongoDB \ No newline at end of file diff --git a/RegularExpressionParser/Parser.py b/RegularExpressionParser/Parser.py index 6743910..e361d2f 100644 --- a/RegularExpressionParser/Parser.py +++ b/RegularExpressionParser/Parser.py @@ -15,27 +15,27 @@ def check_if_address_valid(address): def parse_query_to_url(query): - elements = re.findall("[0-9A-Za-z_\"><]+", query) + elements = re.findall("[0-9A-Za-z_\"><.]+", query) count = len(elements) - if count == 3: + if count == 2: # can only be A.B:C or wrong - if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2]) + if re.search("^[0-9a-zA-Z_.]+:[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + elements[1]) else: - print("Invalid query.") + print("Invalid query1.") return "" - elif count == 4: + elif count == 3: # a pair and one of [NOT, >, <]. - if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2]) - elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2]) - elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query): - return url_safe(elements[0] + "."
+ elements[1] + "%3A" + "<" + elements[2]) + if re.search("^[0-9a-zA-Z_.]+:\s*NOT\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + "NOT" + elements[2]) + elif re.search("^[0-9a-zA-Z_.]+:\s*>\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + ">" + elements[2]) + elif re.search("^[0-9a-zA-Z_.]+:\s*<\s*[0-9a-zA-Z_\".]", query): + return url_safe(elements[0] + "%3A" + "<" + elements[2]) else: - print("Invalid query.") + print("Invalid query2.") return "" - elif 6 <= count <= 8: + elif 5 <= count <= 7: # AND or OR operator if re.search(".*\sAND\s.*", query): parts = query.split(" AND ") @@ -44,17 +44,22 @@ def parse_query_to_url(query): parts = query.split(" OR ") return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1]) else: - print("Invalid query.") + print("Invalid query3.") return "" def parse_query_to_json(pair): - print(pair) elements = re.findall("[0-9A-Za-z\"_.]+", pair) count = len(elements) - if count != 2: + if count != 2 and count != 3: print("Failed to parse query: invalid args number") return {"wrong": "True"} # will never be found in the database + elif count == 3: + # A.B: NOT C + if re.search("^[0-9.]*$", elements[2]) and not re.search("id", elements[0]): + return {elements[0].split(".")[1]: {"$ne": float(elements[2])}} + else: + return {elements[0].split(".")[1]: {"$not": {"$regex": elements[2], "$options": "i"}}} else: # can be A.B: C or A.B: "C" if re.search(":", pair): @@ -66,12 +71,7 @@ def parse_query_to_json(pair): else: return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}} else: - if re.search("NOT", pair): - if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]): - return {elements[0].split(".")[1]: {"$ne": float(elements[1])}} - else: - return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}} - elif re.search(">", pair): + if re.search(">", pair): return {elements[0].split(".")[1]: {"$gt": float(elements[1])}} elif re.search("<", pair): return {elements[0].split(".")[1]: {"$lt": float(elements[1])}} @@ -81,4 +81,4 @@ def parse_query_to_json(pair): def url_safe(element): - return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C") \ No newline at end of file + return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C").replace("&", "%26").replace(":", "%3A") diff --git a/Server/SimpleServer.py b/Server/SimpleServer.py index 8f34109..6507c32 100644 --- a/Server/SimpleServer.py +++ b/Server/SimpleServer.py @@ -4,6 +4,8 @@ import re import Crawler.Scrape as Scrape from flask import Flask from flask import request +from flask import abort +from flask import jsonify from urllib.parse import urlparse, parse_qs app = Flask(__name__) @@ -12,45 +14,51 @@ app = Flask(__name__) @app.route("/", methods=['GET']) def home(): + """ homepage of the server """ return "200: successfully connected to home page\n" @app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"]) def data_base(collection): + """ database endpoint of the server """ + print("\n===============================\n") + print(collection) + print("\n===============================\n") if request.method == "GET": if collection == "book": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) if qs_parsed == {}: - return DataBase.get_documents_json(0, {}) - return search_document(["book.id:" + qs_parsed["id"][0]]) + return jsonify(DataBase.get_documents_json(0, {})) + return jsonify(search_document(["book.id:" + qs_parsed["id"][0]])) elif
collection == "author": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) if qs_parsed == {}: - return DataBase.get_documents_json(1, {}) - return search_document(["author.id:" + qs_parsed["id"][0]]) + return jsonify(DataBase.get_documents_json(1, {})) + return jsonify(search_document(["author.id:" + qs_parsed["id"][0]])) elif collection == "search": url_parsed = urlparse(request.url) qs_parsed = parse_qs(url_parsed.query) - return search_document(qs_parsed["q"][0].split("&")) + result = jsonify(search_document(qs_parsed["q"][0].split("&"))) + return jsonify(result) else: - return "404: Unknown Collection to GET" + abort(404) elif request.method == "PUT": if request.headers["Content-Type"] != "application/json": - return "415: content should be JSON file" + abort(415) json_update_info = request.json if collection == "book": opt = 0 elif collection == "author": opt = 1 else: - return "404: Unknown Collection to PUT" + abort(404) DataBase.update_dicts(opt, request.args.to_dict(), json_update_info) return "200: PUT succeeded" elif request.method == "POST": if request.headers["Content-Type"] != "application/json": - return "415: content should be JSON file" + abort(415, "content should be JSON file") json_file = request.json if collection == "books": DataBase.insert_dicts(json_file, 0) @@ -61,18 +69,15 @@ def data_base(collection): elif collection == "author": DataBase.insert_document(json_file, 1) elif collection == "scrape": - # url_parsed = urlparse(request.url) - # qs_parsed = parse_qs(url_parsed.query) - # print(qs_parsed) param = request.args.to_dict() url = param["url"] max_book = param["max_book"] max_author = param["max_author"] Scrape.scrape_api(url, max_book, max_author) - return "201: new data has been added to database" + return "200: new data has been added to database" else: - return "404: Unknown Collection to POST" - return "201: POST succeeded" + abort(404) + return "200: POST succeeded" elif request.method == "DELETE": identifier = request.args.to_dict() print(identifier) @@ -81,7 +86,7 @@ def data_base(collection): elif collection == "author": opt = 1 else: - return "404: Unknown Collection to DELETE" + abort(404, "Unknown Collection to DELETE") DataBase.clean(opt, identifier) return "200: DELETE succeeded" diff --git a/Tests/DataBaseTests.py b/Tests/DataBaseTests.py new file mode 100644 index 0000000..0173f8d --- /dev/null +++ b/Tests/DataBaseTests.py @@ -0,0 +1,53 @@ +import unittest +import os +import DataBase.mongoDB as db +import json +from pymongo import MongoClient +from dotenv import load_dotenv + + +class DataBaseTests(unittest.TestCase): + + def setUp(self): + file_path = os.path.dirname(__file__) + r"\testData.json" + with open(file_path) as file: + self.test_data = json.load(file) + self.test_data1 = self.test_data["test_data1"] + + def tearDown(self): + data_base = db.get_db() + record = data_base.test_books + record.delete_many({}) + + def test_upload(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + self.assertEqual(record.count_documents({}), 1) + + def test_download(self): + db.insert_document(self.test_data1, 0) + json_file = json.loads(db.get_documents_json(0, {"title": "Becoming"})) + no_id_test_data1 = self.test_data1 + no_id_test_data1.pop("_id") + self.assertEqual(json_file["books"][0], no_id_test_data1) + + def test_update(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + db.update_dicts(0, {"title": "Becoming"}, 
{"rating_count": 1000000}) + self.assertEqual(record.count_documents({"rating_count": self.test_data1["rating_count"]}), 0) + self.assertEqual(record.count_documents({"rating_count": 1000000}), 1) + + def test_delete(self): + data_base = db.get_db() + record = data_base.test_books + db.insert_document(self.test_data1, 0) + db.clean(0, {}) + self.assertEqual(record.count_documents({}), 0) + + +if __name__ == '__main__': + unittest.main() + # db.clean(0, {}) diff --git a/Tests/ParserTests.py b/Tests/ParserTests.py new file mode 100644 index 0000000..a46915d --- /dev/null +++ b/Tests/ParserTests.py @@ -0,0 +1,36 @@ +import RegularExpressionParser.Parser as Parser +import unittest + + +class DataBaseTests(unittest.TestCase): + + def test_safe_url(self): + str_query = "search?q=book.rating_count<1000000&AND&book.rating>4.4" + expected_url = "http://127.0.0.1:5000/api/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4" + output_url = "http://127.0.0.1:5000/api/" + Parser.url_safe(str_query) + self.assertEqual(output_url, expected_url) + + def test_valid_goodreads_url(self): + url = "https://www.goodreads.com/book/show/35133922-educated" + self.assertTrue(Parser.check_if_address_valid(url)) + + def test_invalid_goodreads_url(self): + url1 = "https://www.google.com/book/show/35133922-educated" + self.assertFalse(Parser.check_if_address_valid(url1)) + url2 = "https://www.goodreads.com/book/show/over-35133922-educated" + self.assertFalse(Parser.check_if_address_valid(url2)) + url3 = "https://www.goodreads.com/book/show/educated" + self.assertFalse(Parser.check_if_address_valid(url3)) + + def test_parse_query_to_url(self): + query = "book.id:12345678 AND book.rating: > 4.80" + result_url = Parser.parse_query_to_url(query) + expect_url = "book.id%3A12345678%26AND%26book.rating%3A%3E4.80" + self.assertEqual(result_url, expect_url) + + def test_parse_query_to_json(self): + query = "book.id: NOT 12345678" + expect_json = {"id": {"$not": {"$regex": "12345678", "$options": "i"}}} + output_json = Parser.parse_query_to_json(query) + self.assertEqual(output_json, expect_json) + diff --git a/Tests/ServerTests.py b/Tests/ServerTests.py new file mode 100644 index 0000000..2bad5db --- /dev/null +++ b/Tests/ServerTests.py @@ -0,0 +1,64 @@ +import unittest +import os +import json +import DataBase.mongoDB as db +import requests +import Server.SimpleServer + + +class DataBaseTests(unittest.TestCase): + def setUp(self): + file_path = os.path.dirname(__file__) + r"\testData.json" + with open(file_path) as file: + self.test_data = json.load(file) + self.test_data1 = self.test_data["test_data1"] + + def tearDown(self): + data_base = db.get_db() + record = data_base.test_books + record.delete_many({}) + + def test_valid_get(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=38746485" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + + def test_search(self): + db.insert_document(self.test_data1, 0) + url1 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485" + res1 = requests.get(url1) + self.assertEqual(res1.status_code, 200) + url2 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485%26book.rating%3E4.90" + res2 = requests.get(url2) + self.assertEqual(res2.json(), {}) + + def test_invalid_get_not_exist(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=12345678" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + self.assertEqual(res.json(), {'books': []}) + + def 
test_invalid_get_wrong_collection_name(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/bookssss?id=38746485" + res = requests.get(url) + self.assertEqual(res.status_code, 200) + + def test_valid_put(self): + db.insert_document(self.test_data1, 0) + url = "http://127.0.0.1:5000/api/book?id=38746485" + update_info = {"rating_count": 1000000} + res = requests.put(url, json=update_info) + self.assertEqual(res.status_code, 200) + + def test_insert_put(self): + url = "http://127.0.0.1:5000/api/book?id=38746485" + update_info = {"rating_count": 1000000} + res = requests.put(url, json=update_info) + self.assertEqual(res.status_code, 200) + + +if __name__ == '__main__': + unittest.main() diff --git a/Tests/testData.json b/Tests/testData.json new file mode 100644 index 0000000..5e7dc0b --- /dev/null +++ b/Tests/testData.json @@ -0,0 +1,34 @@ +{ + "test_data1": {"type": "book", + "book_url": "https://www.goodreads.com/book/show/38746485-becoming", + "title": "Becoming", + "id": "38746485", + "ISBN": "0", + "author_url": "https://www.goodreads.com/author/show/2338628.Michelle_Obama", + "author": "Michelle Obama", + "rating": 4.52, + "rating_count": 661305, + "review_count": 54516, + "image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1528206996l/38746485.jpg", + "similar_books": ["Becoming", + "Educated", + "Born a Crime: Stories From a South African Childhood", + "More Than Love, A Husband's Tale", + "I Am Malala: The Story of the Girl Who Stood Up for Education and Was Shot by the Taliban", + "Untamed", + "Where the Crawdads Sing", + "A Promised Land", + "Dreams from My Father: A Story of Race and Inheritance", + "Drum Beats, Heart Beats (Dancing Soul Trilogy, #3)", + "Eat, Pray, Love", + "Losing Jon: A Teen's Tragic Death, a Police Cover-Up, a Community's Fight for Justice", + "I Know Why the Caged Bird Sings (Maya Angelou's Autobiography, #1)", + "The Vanishing Half", + "Self Made: Inspired by the Life of Madam C.J. Walker", + "Bossypants", + "The Audacity of Hope: Thoughts on Reclaiming the American Dream", + "Such a Fun Age", + "Between the World and Me", + "Black Fortunes: The Story of the First Six African Americans Who Escaped Slavery and Became Millionaires", + "Little Fires Everywhere"]} +} \ No newline at end of file -- GitLab
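Usage note (not part of the patch series): a minimal sketch of how a client might exercise the API introduced above, assuming the Flask server from Server/SimpleServer.py is running locally at http://127.0.0.1:5000 as in Tests/ServerTests.py. The book id, query string, and update payload are example values taken from the test data; the max_book/max_author limits are arbitrary.

import requests

BASE = "http://127.0.0.1:5000/api"

# Trigger a scrape run. SimpleServer reads url, max_book and max_author from the
# query string, but its POST branch first checks for a JSON content type, so an
# empty JSON body is sent along to satisfy that check.
res = requests.post(BASE + "/scrape",
                    params={"url": "https://www.goodreads.com/book/show/35133922-educated",
                            "max_book": 5, "max_author": 5},
                    json={})
print(res.status_code, res.text)

# Search stored books. ':', '>', '<' and the joining '&' are percent-encoded
# (%3A, %3E, %3C, %26), the same escaping Parser.url_safe applies on the client side.
res = requests.get(BASE + "/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4")
print(res.json())

# Update a single book document by id; requests sets the JSON content type itself.
res = requests.put(BASE + "/book", params={"id": "38746485"}, json={"rating_count": 1000000})
print(res.text)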