Commit 155ae6b1 authored by zshan2

assignment2.1 version3: part3 finished, start testing

parent a3565f17
1 merge request: !1 assignment-2.1
@@ -19,7 +19,7 @@ def get(query):
print("GET: " + query)
url = safe_url(query)
req = requests.get(url)
print("response code: " + str(req.status_code))
print("\nresponse code: " + str(req.status_code) + "\n")
return req.json()
@@ -27,15 +27,15 @@ def put(query, json_file):
print("PUT: " + query)
url = safe_url(query)
req = requests.put(url, json=json_file)
print("response code: " + str(req.status_code))
print("\nresponse code: " + str(req.status_code) + "\n")
return req.text
def post(query, json_file):
print("POST: " + query)
url = safe_url(query)
req = requests.put(url, json=json_file)
print("response code: " + str(req.status_code))
req = requests.post(url, json=json_file)
print("\nresponse code: " + str(req.status_code) + "\n")
return req.text
@@ -43,7 +43,7 @@ def delete(query):
print("DELETE: " + query)
url = safe_url(query)
req = requests.delete(url)
print("response code: " + str(req.status_code))
print("\nresponse code: " + str(req.status_code) + "\n")
return req.text
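These helpers are thin wrappers around the requests library. A minimal usage sketch, assuming the module is imported as API.Functions and that safe_url() prepends the server's base address (the id value is only illustrative):

import API.Functions as API

# fetch one book document by id; the query string is passed through safe_url() unchanged
book_json = API.get("book?id=3735293")

# create or update a document, then remove it again
print(API.put("book?id=3735293", {"rating": 4.5}))
print(API.delete("book?id=3735293"))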
import typer
import RegularExpressionParser.Parser as Parser
import Crawler.Main as Main
import Crawler.Scrape as Scrape
import API.Functions as API
import os
import json
import re
from dotenv import load_dotenv
app = typer.Typer()
@@ -11,20 +12,24 @@ app = typer.Typer()
@app.command()
def main_page():
print("Welcome to use goodReads scraper!\n"
"This is main page\n"
"Usage:\n"
"scrape: scrape goodRead and load data to database\n"
"get: find a document for a book or an author from database\n"
"put: create or update a document in database\n"
"post: add one or multiple documents of book or author in database \n"
"delete: delete certain document from database\n"
"export: export a certain collection from database\n"
"exit: end the program")
"""
main page of the client; run this .py directly to enter it and see the usage
:return: none
"""
print("==================================================================\n\n"
"Welcome to use goodReads scraper!\n\n"
"This is main page\n\n"
"You can input back to return to this page at anytime\n\n"
"Usage:\n\n"
"get: find a document for a book or an author from database\n\n"
"put: create or update a document in database\n\n"
"post: add one or multiple documents of book or author in database \n\n"
"delete: delete certain document from database\n\n"
"export: export a certain collection from database\n\n"
"exit: end the program\n\n"
"Please enter your command:\n")
order = input()
if order == "scrape":
scrape()
elif order == "get":
if order == "get":
get()
elif order == "put":
put()
@@ -34,152 +39,186 @@ def main_page():
delete()
elif order == "export":
export()
elif order == "exit":
if order == "exit":
exit()
else:
print("Unknown command")
main_page()
def scrape():
def check_and_back(command):
""" check if used in put back and return to main page if so """
if command == "back":
main_page()
def back_to_main_page():
""" wait for user to press enter and then return to the main page """
print("press enter to return to main page")
no_use = input()
main_page()
def scrape(query):
"""
Function used to send a scrape request to the server.
Only for assignment 2.0; for GET scrape, see GET in SimplerServer.py.
"""
print('please enter URL of the book')
url = input()
print('please enter the number of books you want to crawl')
book_num = input()
attributes = query.split("?")[1].split("&")
print(attributes)
url = attributes[0].split("=", 1)[1]
book_num = attributes[1].split("=")[1]
if not book_num.isdigit():
print("max book num must be an integer")
main_page()
back_to_main_page()
book_num = int(book_num)
print('please enter the number of authors you want to crawl')
author_num = input()
author_num = attributes[2].split("=")[1]
if not author_num.isdigit():
print("max author num must be an integer")
main_page()
back_to_main_page()
author_num = int(author_num)
print('Request received, start scraping')
if not Parser.check_if_address_valid(url):
print("Error: invalid URL")
main_page()
elif book_num >= 2000 or author_num >= 2000:
print("Error, the number of book and author should be lower than 2000")
main_page()
back_to_main_page()
elif book_num >= 2000 or author_num >= 500 or book_num <= 0 or author_num <= 0:
print("Error, the number of book and author should be positive and lower than 2000/500")
back_to_main_page()
else:
if book_num > 200 or author_num > 50:
print("Warning, maximum of the number of books or authors is large")
Main.scrape_api(url, book_num, author_num)
main_page()
result = API.post(query, {})
return result
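# Illustrative note (an assumption based on the parameter names the server reads later in
# this commit): the query passed to scrape() is expected to look like
#   scrape?url=https://www.goodreads.com/book/show/3735293-clean-code&max_book=20&max_author=5
# so attributes[0], attributes[1] and attributes[2] carry url, max_book and max_author.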
def get():
print("Please enter your query:")
""" GET API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
check_and_back(query)
result = API.get(query)
typer.echo(result)
main_page()
print()
back_to_main_page()
def put():
print("Please enter your query:")
""" PUT API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
print("Please select one way to load update info: input or read:")
check_and_back(query)
print("Please select one way to load update info: input or read:\n")
load = input()
check_and_back(load)
if load == "input":
print("Please enter json data:")
print("Please enter json data:\n")
string = input()
check_and_back(string)
try:
data = json.loads(string)
except ValueError as err:
print(err)
main_page()
back_to_main_page()
elif load == "read":
load_dotenv()
file_root = os.getenv('FILE_ROOT')
print("Please enter the name of file which is in the file root directory:")
file_path = r"" + file_root + input()
print("Please enter the name of file which is in the file root directory:\n")
name = input()
check_and_back(name)
file_path = r"" + file_root + name
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
except ValueError as err:
print(err)
main_page()
back_to_main_page()
else:
print("Error: file not exist")
main_page()
back_to_main_page()
else:
print("Unknown command")
main_page()
back_to_main_page()
result = API.put(query, data)
typer.echo(result)
main_page()
print()
back_to_main_page()
def post():
print("Please enter your query:")
""" POST API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
load_dotenv()
file_root = os.getenv('FILE_ROOT')
print("Please enter the name of file which is in the file root directory:")
file_path = r"" + file_root + input()
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
except ValueError as err:
print(err)
main_page()
check_and_back(query)
if re.search("scrape", query):
result = scrape(query)
typer.echo(result)
print()
back_to_main_page()
else:
print("Error: file not exist")
main_page()
result = API.post(query, data)
typer.echo(result)
main_page()
load_dotenv()
file_root = os.getenv('FILE_ROOT')
print("Please enter the name of file which is in the file root directory:\n")
name = input()
check_and_back(name)
file_path = r"" + file_root + name
if os.path.isfile(file_path):
with open(file_path, "r") as file:
try:
data = json.load(file)
except ValueError as err:
print(err)
back_to_main_page()
else:
print("Error: file not exist")
back_to_main_page()
result = API.post(query, data)
typer.echo(result)
print()
back_to_main_page()
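# Note on the flow above: post() now routes on the query text. A query containing "scrape"
# is forwarded to scrape(), which POSTs it to the server; any other query posts the contents
# of a JSON file located under the FILE_ROOT directory configured in the .env file.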
def delete():
print("Please enter your query:")
""" DELETE API """
print("==================================================================\n")
print("Please enter your query:\n")
query = input()
check_and_back(query)
result = API.delete(query)
typer.echo(result)
main_page()
print()
back_to_main_page()
def export():
print("Which collection do you want to export? books or authors?")
""" export the book or author collection """
print("==================================================================\n")
print("Which collection do you want to export: books or authors?\n")
collection = input()
check_and_back(collection)
if collection == "books":
json = API.get("book")
json_file = API.get("book")
load_dotenv()
file_root = os.getenv('FILE_ROOT')
with open(file_root + "books" + ".json", "w") as output:
output.write(json)
output.write(json_file)
print("books.json is stored at " + file_root)
main_page()
back_to_main_page()
elif collection == "authors":
json = API.get("author")
json_file = API.get("author")
load_dotenv()
file_root = os.getenv('FILE_ROOT')
with open(file_root + "authors" + ".json", "w") as output:
output.write(json)
output.write(json_file)
print("authors.json is stored at " + file_root)
main_page()
back_to_main_page()
elif collection == "back":
main_page()
back_to_main_page()
else:
print("Unknown collection")
main_page()
# def goodbye(name: str, formal: bool = False):
# """ function2 """
# if formal:
# typer.echo(f"Goodbye Ms. {name}. Have a good day.")
# else:
# typer.echo(f"Bye {name}!")
back_to_main_page()
if __name__ == "__main__":
......
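The put, post and export commands above resolve file paths through python-dotenv. A minimal sketch of the assumed configuration and lookup (the directory is only an example):

# .env, placed in the working directory:
#   FILE_ROOT=C:/goodreads_data/
import os
from dotenv import load_dotenv

load_dotenv()                        # loads the variables from .env into the environment
file_root = os.getenv("FILE_ROOT")   # e.g. "C:/goodreads_data/"
print(file_root + "books.json")      # where export() writes the books collection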
import Crawler.Spider as Spider
import time
import RegularExpressionParser.Parser as Parser
from DataBase import mongoDB as db
@@ -15,10 +14,10 @@ def scrape_api(url_api, book_num_api, author_num_api):
start_page_api = Spider.Spider(url_api)
start_page_api.crawl(book_dict_api, author_dict_api, similar_book_dict_api)
while len(book_dict_api) < int(book_num_api) and len(author_dict_api) < int(author_num_api):
while len(book_dict_api) < int(book_num_api) or len(author_dict_api) < int(author_num_api):
all_related_books_api = list(similar_book_dict_api.items())
for book_url_pair_api in all_related_books_api:
if len(book_dict_api) >= int(book_num_api) or len(author_dict_api) >= int(author_num_api):
if len(book_dict_api) >= int(book_num_api) and len(author_dict_api) >= int(author_num_api):
break
book_name_api = book_url_pair_api[0]
if book_name_api in book_dict_api:
@@ -36,46 +35,4 @@ def scrape_api(url_api, book_num_api, author_num_api):
db.insert_dicts(book_dict_api, 0)
db.insert_dicts(author_dict_api, 1)
print("Scraping completed")
print('please enter URL of the book')
url = input()
print('please enter the number of books you want to crawl')
book_num = input()
print('please enter the number of authors you want to crawl')
author_num = input()
print('Request received, start scraping')
book_dict = {}
author_dict = {}
similar_book_dict = {}
count = 1
print("Crawling book #" + str(count))
start_page = Spider.Spider(url)
start_page.crawl(book_dict, author_dict, similar_book_dict)
while len(book_dict) < int(book_num) or len(author_dict) < int(author_num):
all_related_books = list(similar_book_dict.items())
for book_url_pair in all_related_books:
if len(book_dict) >= int(book_num) and len(author_dict) >= int(author_num):
break
book_name = book_url_pair[0]
if book_name in book_dict:
pass
else:
count += 1
print("Crawling book #" + str(count))
book_url = book_url_pair[1]
book_spider = Spider.Spider(book_url)
book_spider.crawl(book_dict, author_dict, similar_book_dict)
# since the network delay from asia to goodread.com is already very high,
# a small amount of time of sleep is used here.
time.sleep(0.2)
db.insert_dicts(book_dict, 0)
db.insert_dicts(author_dict, 1)
print("Scraping completed")
print("Scraping completed")
\ No newline at end of file
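The condition swap above means crawling now continues while either quota is unmet, and the inner loop breaks only once both are met. A small self-contained sketch of that termination logic (the quotas and the fake crawl step are illustrative):

book_num, author_num = 2, 3
books, authors = {}, {}
step = 0
while len(books) < book_num or len(authors) < author_num:
    step += 1
    books["book" + str(step)] = None       # every step finds a new book...
    authors["author" + str(step)] = None   # ...and its author
    if len(books) >= book_num and len(authors) >= author_num:
        break                              # stop only when both quotas are met
print(len(books), len(authors))            # -> 3 3: books overshoot until the author quota is met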
@@ -11,12 +11,21 @@ class Spider:
self.soup = None
def crawl(self, book_dict, author_dict, similar_book_dict):
"""
function used to scrape data of a certain book
:param book_dict: dictionary of scraped books, keyed by book title
:param author_dict: dictionary of scraped authors
:param similar_book_dict: dictionary mapping titles of similar books to their URLs
:return: none
"""
header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
url = self.url
res = requests.get(url, headers=header)
self.soup = BeautifulSoup(res.text, "lxml")
# parsing data of current book
titleAndAuthor = self.soup.title.string.split(" by ")
title = titleAndAuthor[0]
bookID = ((self.url.split("show/")[1]).split("-")[0]).split(".")[0]
@@ -36,6 +45,8 @@ class Spider:
authorID = (author_page_link.split("show/")[1]).split(".")[0]
see_more_link = self.soup.find("a", {"class": "actionLink right seeMoreLink"})["href"]
res_similar = requests.get(see_more_link, headers=header)
# parsing similar books of current book
similar_soup = BeautifulSoup(res_similar.text, 'lxml')
similar_books_and_authors = similar_soup.find_all("span", itemprop="name")
similar_books = []
@@ -51,6 +62,8 @@ class Spider:
bookPKT = BookPack.BookPackage(url, title, bookID, ISBN13, author_page_link, author, rating, rating_count,
review_count, imageURL, similar_books)
book_dict[title] = bookPKT
# parse data of the author of current book
res_author = requests.get(author_page_link, headers=header)
author_soup = BeautifulSoup(res_author.text, 'lxml')
author_rating = float(author_soup.find("span", {"class": "average"}).text)
......
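A standalone sketch of the string handling crawl() relies on, using an illustrative Goodreads-style page title and pair of URLs (no network access, only the split logic from the hunks above):

page_title = "Clean Code by Robert C. Martin"
book_url = "https://www.goodreads.com/book/show/3735293-clean-code"
author_link = "https://www.goodreads.com/author/show/45372.Robert_C_Martin"

title = page_title.split(" by ")[0]                                   # "Clean Code"
book_id = ((book_url.split("show/")[1]).split("-")[0]).split(".")[0]  # "3735293"
author_id = (author_link.split("show/")[1]).split(".")[0]             # "45372"
print(title, book_id, author_id)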
@@ -58,7 +58,7 @@ def parse_query_to_json(pair):
else:
# can be A.B: C or A.B: "C"
if re.search(":", pair):
if re.search("^[0-9.]*$", elements[1]):
if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: float(elements[1])}
else:
if re.search("\".*\"", elements[1]):
@@ -67,7 +67,7 @@ def parse_query_to_json(pair):
return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
else:
if re.search("NOT", pair):
if re.search("^[0-9.]*$", elements[1]):
if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
else:
return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
......
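Both hunks above add the same guard, so numeric-looking values are only coerced to float when the field is not an id. A minimal standalone sketch of that guard (the helper name is hypothetical):

import re

def coerce_to_float(field, value):
    """ mirrors the new condition: True only for numeric values on non-id fields """
    return bool(re.search("^[0-9.]*$", value)) and not re.search("id", field)

print(coerce_to_float("book.rating", "4.2"))   # True  -> value stored as float(4.2)
print(coerce_to_float("book.id", "3735293"))   # False -> id kept as a string/regex match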
import DataBase.mongoDB as DataBase
import RegularExpressionParser.Parser as Parser
import re
import Crawler.Scrape as Scrape
from flask import Flask
from flask import request
from urllib.parse import urlparse, parse_qs
@@ -20,21 +21,18 @@ def data_base(collection):
if collection == "book":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed["id"])
if qs_parsed == {}:
return DataBase.get_documents_json(0, {})
return search_document(["book.id:" + qs_parsed["id"][0]])
elif collection == "author":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
# print(type(qs_parsed["id"]))
if qs_parsed == {}:
return DataBase.get_documents_json(1, {})
return search_document(["author.id:" + qs_parsed["id"][0]])
elif collection == "search":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
print(qs_parsed)
return search_document(qs_parsed["q"][0].split("&"))
else:
return "404: Unknown Collection to GET"
@@ -47,7 +45,7 @@ def data_base(collection):
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to GET"
return "404: Unknown Collection to PUT"
DataBase.update_dicts(opt, request.args.to_dict(), json_update_info)
return "200: PUT succeeded"
elif request.method == "POST":
@@ -63,7 +61,15 @@ def data_base(collection):
elif collection == "author":
DataBase.insert_document(json_file, 1)
elif collection == "scrape":
return "200"
# url_parsed = urlparse(request.url)
# qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed)
param = request.args.to_dict()
url = param["url"]
max_book = param["max_book"]
max_author = param["max_author"]
Scrape.scrape_api(url, max_book, max_author)
return "201: new data has been added to database"
else:
return "404: Unknown Collection to POST"
return "201: POST succeeded"
@@ -81,6 +87,7 @@ def data_base(collection):
def search_document(identifiers):
""" function used to find one or several document in database """
if len(identifiers) == 1:
json_idt = Parser.parse_query_to_json(identifiers[0])
print(json_idt)
@@ -96,7 +103,7 @@ def search_document(identifiers):
else:
opt = 0
else:
if re.search("^book.*",identifiers[2]):
if re.search("^book.*", identifiers[2]):
print("Failed to find documentation: two statements are not pointing to the same collection")
return {}
else:
......
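A hedged sketch of exercising the new scrape branch from outside with requests; the host, port and /api prefix are assumptions, since the route decorator sits outside the hunks shown here:

import requests

base = "http://127.0.0.1:5000/api"   # base address is an assumption

# start a crawl: the server reads url, max_book and max_author from the query string
resp = requests.post(base + "/scrape", params={
    "url": "https://www.goodreads.com/book/show/3735293-clean-code",
    "max_book": 20,
    "max_author": 5,
})
print(resp.status_code, resp.text)   # expected body: "201: new data has been added to database"

# fetch a single book by id; an empty query string returns the whole collection
print(requests.get(base + "/book", params={"id": "3735293"}).text)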