Skip to content
Snippets Groups Projects
Commit 0b872741 authored by zshan2's avatar zshan2
Browse files

assignment2.1 final version

parent 155ae6b1
No related branches found
No related tags found
1 merge request!1<assignment-2.1>
......@@ -140,6 +140,14 @@ def put():
else:
print("Unknown command")
back_to_main_page()
if re.search("books", query):
if len(data) <= 1:
print("For posting multiple books, please input json file has more than 1 books\n")
main_page()
else:
if len(data) > 1:
print("Cannot post more than one book with 'post book/author', please use 'post books/authors'")
main_page()
result = API.put(query, data)
typer.echo(result)
print()
......
......@@ -16,9 +16,9 @@ def get_db():
def insert_document(docu, opt):
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......@@ -34,9 +34,9 @@ def insert_dicts(dictionary, opt):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......@@ -58,9 +58,9 @@ def update_dicts(opt, identifier, content):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......@@ -82,9 +82,9 @@ def get_documents_json(opt, identifier):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return json.dumps({})
......@@ -127,9 +127,9 @@ def clean(opt, identifier):
"""
db = get_db()
if opt == 0:
records = db.books
records = db.test_books
elif opt == 1:
records = db.authors
records = db.test_authors
else:
print("failed to get json file: wrong opt for selecting collection")
return
......
## SP21-CS242 Assignment2
\ No newline at end of file
## SP21-CS242 Assignment2: GoodReads Scraper
This project is for sp21 CS242 assignment2
Current version is 2.1
### Functions
1. Scrape data for books and authors from GoodReads
2. Store scraped data in the cloud or in local files
3. Query the cloud for certain documents of books or authors
### Composition
#### Client: Command line interface (interactive)
#### Server: Simple server based on Flask
#### Database: MongoDB
\ No newline at end of file
......@@ -15,27 +15,27 @@ def check_if_address_valid(address):
def parse_query_to_url(query):
elements = re.findall("[0-9A-Za-z_\"><]+", query)
elements = re.findall("[0-9A-Za-z_\"><.]+", query)
count = len(elements)
if count == 3:
if count == 2:
# can only be A.B:C or wrong
if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
if re.search("^[0-9a-zA-Z_.]+:[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + elements[1])
else:
print("Invalid query.")
print("Invalid query1.")
return ""
elif count == 4:
elif count == 3:
# a pair and one of [NOT, >, <].
if re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\sNOT\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + "NOT" + elements[2])
elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s>\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + ">" + elements[2])
elif re.search("^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]:\s<\s[0-9a-zA-Z_\"]", query):
return url_safe(elements[0] + "." + elements[1] + "%3A" + "<" + elements[2])
if re.search("^[0-9a-zA-Z_.]+:\s*NOT\s*[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + "NOT" + elements[2])
elif re.search("^[0-9a-zA-Z_.]+:\s*>\s*[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + ">" + elements[2])
elif re.search("^[0-9a-zA-Z_.]+:\s*<\s*[0-9a-zA-Z_\".]", query):
return url_safe(elements[0] + "%3A" + "<" + elements[2])
else:
print("Invalid query.")
print("Invalid query2.")
return ""
elif 6 <= count <= 8:
elif 5 <= count <= 7:
# AND or OR operator
if re.search(".*\sAND\s.*", query):
parts = query.split(" AND ")
......@@ -44,17 +44,22 @@ def parse_query_to_url(query):
parts = query.split(" OR ")
return parse_query_to_url(parts[0]) + "%26OR%26" + parse_query_to_url(parts[1])
else:
print("Invalid query.")
print("Invalid query3.")
return ""
def parse_query_to_json(pair):
print(pair)
elements = re.findall("[0-9A-Za-z\"_.]+", pair)
count = len(elements)
if count != 2:
if count != 2 and count != 3:
print("Failed to parse query: invalid args number")
return {"wrong": "True"} # will never be founded in database
elif count == 3:
# A.B: NOT C
if re.search("^[0-9.]*$", elements[2]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: {"$ne": float(elements[2])}}
else:
return {elements[0].split(".")[1]: {"$not": {"$regex": elements[2], "$options": "i"}}}
else:
# can be A.B: C or A.B: "C"
if re.search(":", pair):
......@@ -66,12 +71,7 @@ def parse_query_to_json(pair):
else:
return {elements[0].split(".")[1]: {"$regex": elements[1], "$options": "i"}}
else:
if re.search("NOT", pair):
if re.search("^[0-9.]*$", elements[1]) and not re.search("id", elements[0]):
return {elements[0].split(".")[1]: {"$ne": float(elements[1])}}
else:
return {elements[0].split(".")[1]: {"$not": {"$regex": elements[1], "$options": "i"}}}
elif re.search(">", pair):
if re.search(">", pair):
return {elements[0].split(".")[1]: {"$gt": float(elements[1])}}
elif re.search("<", pair):
return {elements[0].split(".")[1]: {"$lt": float(elements[1])}}
......@@ -81,4 +81,4 @@ def parse_query_to_json(pair):
def url_safe(element):
return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C")
\ No newline at end of file
return element.replace("\"", "%22").replace(">", "%3E").replace("<", "%3C").replace("&", "%26").replace(":", "%3A")
......@@ -4,6 +4,8 @@ import re
import Crawler.Scrape as Scrape
from flask import Flask
from flask import request
from flask import abort
from flask import jsonify
from urllib.parse import urlparse, parse_qs
app = Flask(__name__)
......@@ -12,45 +14,51 @@ app = Flask(__name__)
@app.route("/", methods=['GET'])
def home():
""" homepage of server """
return "200: successfully connected to home page\n"
@app.route("/api/<collection>/", methods=["GET", "PUT", "POST", "DELETE"])
def data_base(collection):
""" data base page of server """
print("\n===============================\n")
print(collection)
print("\n===============================\n")
if request.method == "GET":
if collection == "book":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
if qs_parsed == {}:
return DataBase.get_documents_json(0, {})
return search_document(["book.id:" + qs_parsed["id"][0]])
return jsonify(DataBase.get_documents_json(0, {}))
return jsonify(search_document(["book.id:" + qs_parsed["id"][0]]))
elif collection == "author":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
if qs_parsed == {}:
return DataBase.get_documents_json(1, {})
return search_document(["author.id:" + qs_parsed["id"][0]])
return jsonify(DataBase.get_documents_json(1, {}))
return jsonify(search_document(["author.id:" + qs_parsed["id"][0]]))
elif collection == "search":
url_parsed = urlparse(request.url)
qs_parsed = parse_qs(url_parsed.query)
return search_document(qs_parsed["q"][0].split("&"))
result = jsonify(search_document(qs_parsed["q"][0].split("&")))
return jsonify(result)
else:
return "404: Unknown Collection to GET"
abort(404)
elif request.method == "PUT":
if request.headers["Content-Type"] != "application/json":
return "415: content should be JSON file"
abort(415)
json_update_info = request.json
if collection == "book":
opt = 0
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to PUT"
abort(404)
DataBase.update_dicts(opt, request.args.to_dict(), json_update_info)
return "200: PUT succeeded"
elif request.method == "POST":
if request.headers["Content-Type"] != "application/json":
return "415: content should be JSON file"
abort(415, "content should be JSON file")
json_file = request.json
if collection == "books":
DataBase.insert_dicts(json_file, 0)
......@@ -61,18 +69,15 @@ def data_base(collection):
elif collection == "author":
DataBase.insert_document(json_file, 1)
elif collection == "scrape":
# url_parsed = urlparse(request.url)
# qs_parsed = parse_qs(url_parsed.query)
# print(qs_parsed)
param = request.args.to_dict()
url = param["url"]
max_book = param["max_book"]
max_author = param["max_author"]
Scrape.scrape_api(url, max_book, max_author)
return "201: new data has been added to database"
return "200: new data has been added to database"
else:
return "404: Unknown Collection to POST"
return "201: POST succeeded"
abort(404)
return "200: POST succeeded"
elif request.method == "DELETE":
identifier = request.args.to_dict()
print(identifier)
......@@ -81,7 +86,7 @@ def data_base(collection):
elif collection == "author":
opt = 1
else:
return "404: Unknown Collection to DELETE"
abort(404, "Unknown Collection to DELETE")
DataBase.clean(opt, identifier)
return "200: DELETE succeeded"
......
import unittest
import os
import DataBase.mongoDB as db
import json
from pymongo import MongoClient
from dotenv import load_dotenv
class DataBaseTests(unittest.TestCase):
    """Integration tests for the DataBase.mongoDB CRUD helpers.

    Every test works against the ``test_books`` collection; ``tearDown``
    wipes it after each test so the cases stay independent.
    """

    def setUp(self):
        """Load the shared fixture document from testData.json."""
        # os.path.join keeps the path portable; the original concatenation
        # with a literal backslash (r"\testData.json") only worked on Windows.
        file_path = os.path.join(os.path.dirname(__file__), "testData.json")
        with open(file_path) as file:
            self.test_data = json.load(file)
        self.test_data1 = self.test_data["test_data1"]

    def tearDown(self):
        """Remove every document inserted during the test."""
        data_base = db.get_db()
        record = data_base.test_books
        record.delete_many({})

    def test_upload(self):
        """insert_document with opt=0 should add exactly one book document."""
        data_base = db.get_db()
        record = data_base.test_books
        db.insert_document(self.test_data1, 0)
        self.assertEqual(record.count_documents({}), 1)

    def test_download(self):
        """get_documents_json should return the stored book (minus _id)."""
        db.insert_document(self.test_data1, 0)
        json_file = json.loads(db.get_documents_json(0, {"title": "Becoming"}))
        # pymongo's insert mutates the dict in place and adds "_id";
        # drop it before comparing with the server's JSON payload.
        no_id_test_data1 = self.test_data1
        no_id_test_data1.pop("_id")
        self.assertEqual(json_file["books"][0], no_id_test_data1)

    def test_update(self):
        """update_dicts should overwrite the matched field's value."""
        data_base = db.get_db()
        record = data_base.test_books
        db.insert_document(self.test_data1, 0)
        db.update_dicts(0, {"title": "Becoming"}, {"rating_count": 1000000})
        self.assertEqual(record.count_documents({"rating_count": self.test_data1["rating_count"]}), 0)
        self.assertEqual(record.count_documents({"rating_count": 1000000}), 1)

    def test_delete(self):
        """clean with an empty filter should drop every document."""
        data_base = db.get_db()
        record = data_base.test_books
        db.insert_document(self.test_data1, 0)
        db.clean(0, {})
        self.assertEqual(record.count_documents({}), 0)
# Allow running this test module directly: python <this file>.
# (Removed a stale commented-out db.clean(0, {}) left over from debugging.)
if __name__ == '__main__':
    unittest.main()
import RegularExpressionParser.Parser as Parser
import unittest
class DataBaseTests(unittest.TestCase):
    """Unit tests for the RegularExpressionParser.Parser helpers."""

    def test_safe_url(self):
        """url_safe should percent-encode <, > and & in a query string."""
        raw_query = "search?q=book.rating_count<1000000&AND&book.rating>4.4"
        encoded = "http://127.0.0.1:5000/api/" + Parser.url_safe(raw_query)
        self.assertEqual(
            encoded,
            "http://127.0.0.1:5000/api/search?q=book.rating_count%3C1000000%26AND%26book.rating%3E4.4",
        )

    def test_valid_goodreads_url(self):
        """A canonical GoodReads book URL is accepted."""
        address = "https://www.goodreads.com/book/show/35133922-educated"
        self.assertTrue(Parser.check_if_address_valid(address))

    def test_invalid_goodreads_url(self):
        """Wrong host, malformed id segment, or missing id are all rejected."""
        bad_addresses = [
            "https://www.google.com/book/show/35133922-educated",
            "https://www.goodreads.com/book/show/over-35133922-educated",
            "https://www.goodreads.com/book/show/educated",
        ]
        for address in bad_addresses:
            self.assertFalse(Parser.check_if_address_valid(address))

    def test_parse_query_to_url(self):
        """AND queries are joined with %26AND%26 between the two clauses."""
        result_url = Parser.parse_query_to_url("book.id:12345678 AND book.rating: > 4.80")
        self.assertEqual(result_url, "book.id%3A12345678%26AND%26book.rating%3A%3E4.80")

    def test_parse_query_to_json(self):
        """NOT on the id field becomes a negated case-insensitive regex."""
        output_json = Parser.parse_query_to_json("book.id: NOT 12345678")
        self.assertEqual(
            output_json,
            {"id": {"$not": {"$regex": "12345678", "$options": "i"}}},
        )
import unittest
import os
import json
import DataBase.mongoDB as db
import requests
import Server.SimpleServer
class DataBaseTests(unittest.TestCase):
    """End-to-end tests for the Flask server's /api endpoints.

    NOTE(review): these are integration tests — they assume the server from
    Server.SimpleServer is already listening on 127.0.0.1:5000 and that
    MongoDB is reachable.
    """

    def setUp(self):
        """Load the shared fixture document from testData.json."""
        # os.path.join keeps the path portable; the original concatenation
        # with a literal backslash (r"\testData.json") only worked on Windows.
        file_path = os.path.join(os.path.dirname(__file__), "testData.json")
        with open(file_path) as file:
            self.test_data = json.load(file)
        self.test_data1 = self.test_data["test_data1"]

    def tearDown(self):
        """Wipe the test_books collection so tests stay independent."""
        data_base = db.get_db()
        record = data_base.test_books
        record.delete_many({})

    def test_valid_get(self):
        """GET /api/book?id=<existing id> should return 200."""
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/book?id=38746485"
        res = requests.get(url)
        self.assertEqual(res.status_code, 200)

    def test_search(self):
        """GET /api/search handles matching and empty-result queries."""
        db.insert_document(self.test_data1, 0)
        url1 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485"
        res1 = requests.get(url1)
        self.assertEqual(res1.status_code, 200)
        # No stored book has rating > 4.90, so the result set is empty.
        url2 = "http://127.0.0.1:5000/api/search?q=book.id%3A38746485%26book.rating%3E4.90"
        res2 = requests.get(url2)
        self.assertEqual(res2.json(), {})

    def test_invalid_get_not_exist(self):
        """GET with an unknown id returns 200 and an empty book list."""
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/book?id=12345678"
        res = requests.get(url)
        self.assertEqual(res.status_code, 200)
        self.assertEqual(res.json(), {'books': []})

    def test_invalid_get_wrong_collection_name(self):
        """GET on an unknown collection is rejected with 404.

        The server aborts with 404 for collections other than book/author/
        search, so the original expectation of 200 could never pass.
        """
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/bookssss?id=38746485"
        res = requests.get(url)
        self.assertEqual(res.status_code, 404)

    def test_valid_put(self):
        """PUT with a JSON body on an existing book should return 200."""
        db.insert_document(self.test_data1, 0)
        url = "http://127.0.0.1:5000/api/book?id=38746485"
        update_info = {"rating_count": 1000000}
        res = requests.put(url, json=update_info)
        self.assertEqual(res.status_code, 200)

    def test_insert_put(self):
        """PUT on an id that is absent from the database still returns 200."""
        url = "http://127.0.0.1:5000/api/book?id=38746485"
        update_info = {"rating_count": 1000000}
        res = requests.put(url, json=update_info)
        self.assertEqual(res.status_code, 200)
# Allow running this test module directly: python <this file>.
if __name__ == '__main__':
    unittest.main()
{
"test_data1": {"type": "book",
"book_url": "https://www.goodreads.com/book/show/38746485-becoming",
"title": "Becoming",
"id": "38746485",
"ISBN": "0",
"author_url": "https://www.goodreads.com/author/show/2338628.Michelle_Obama",
"author": "Michelle Obama",
"rating": 4.52,
"rating_count": 661305,
"review_count": 54516,
"image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1528206996l/38746485.jpg",
"similar_books": ["Becoming",
"Educated",
"Born a Crime: Stories From a South African Childhood",
"More Than Love, A Husband's Tale",
"I Am Malala: The Story of the Girl Who Stood Up for Education and Was Shot by the Taliban",
"Untamed",
"Where the Crawdads Sing",
"A Promised Land",
"Dreams from My Father: A Story of Race and Inheritance",
"Drum Beats, Heart Beats (Dancing Soul Trilogy, #3)",
"Eat, Pray, Love",
"Losing Jon: A Teen's Tragic Death, a Police Cover-Up, a Community's Fight for Justice",
"I Know Why the Caged Bird Sings (Maya Angelou's Autobiography, #1)",
"The Vanishing Half",
"Self Made: Inspired by the Life of Madam C.J. Walker",
"Bossypants",
"The Audacity of Hope: Thoughts on Reclaiming the American Dream",
"Such a Fun Age",
"Between the World and Me",
"Black Fortunes: The Story of the First Six African Americans Who Escaped Slavery and Became Millionaires",
"Little Fires Everywhere"]}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment