Skip to content
Snippets Groups Projects
Parser.py 2.93 KiB
Newer Older
  • Learn to ignore specific revisions
  • import re
    
    
    def check_if_address_valid(address):
        """
        Check whether an address looks like a Goodreads book page URL.

        The address is accepted when it starts with
        ``https://www.goodreads.com/book/show/`` followed by at least one digit.
        Anything after the numeric id (slug, query string, ...) is ignored.

        :param address: candidate URL to check
        :return: True if the address has the expected prefix, False otherwise
                 (including when ``address`` is not a string)
        """
        if not isinstance(address, str):
            return False
        # The dots are escaped so that they only match a literal "." in the
        # host name instead of any character.
        pattern = r"^https://www\.goodreads\.com/book/show/[0-9]+"
        return re.search(pattern, address) is not None
    
    
    def parse_query_to_url(query):
        """
        Translate a textual query into its URL-encoded form.

        Supported shapes, where ``A`` and ``B`` are runs of letters, digits and
        underscores and ``C`` may additionally contain double quotes:

        * ``A.B:C``                          -> ``A.B%3AC``
        * ``A.B: NOT C`` / ``A.B: > C`` / ``A.B: < C``
                                             -> ``A.B%3A`` + operator + ``C``
        * two of the above joined by " AND " or " OR "
                                             -> left + ``%26AND%26`` / ``%26OR%26`` + right

        Single queries are passed through ``url_safe`` before being returned.

        :param query: the query string to translate
        :return: the encoded query, or "" (after printing "Invalid query.")
                 when the query does not have one of the supported shapes
        """
        elements = re.findall(r'[0-9A-Za-z_"><]+', query)
        count = len(elements)
        # "A.B:" prefix; both names may be longer than one character.
        field = r'^[0-9a-zA-Z_]+\.[0-9a-zA-Z_]+:'
        value = r'[0-9a-zA-Z_"]'

        if count == 3:
            # can only be A.B:C or wrong
            if re.search(field + value, query):
                return url_safe(elements[0] + "." + elements[1] + "%3A" + elements[2])
        elif count == 4:
            # a pair and one of [NOT, >, <]: elements[2] is the operator token,
            # so the operand to append is elements[3].
            for operator in ("NOT", ">", "<"):
                if re.search(field + r"\s" + operator + r"\s" + value, query):
                    return url_safe(
                        elements[0] + "." + elements[1] + "%3A" + operator + elements[3]
                    )
        elif 6 <= count <= 9:
            # AND or OR operator; two 4-token sides plus the connective give
            # at most 9 tokens. AND is looked for before OR.
            for connective in ("AND", "OR"):
                separator = " " + connective + " "
                if separator in query:
                    left, _, right = query.partition(separator)
                    left_url = parse_query_to_url(left)
                    right_url = parse_query_to_url(right)
                    if left_url and right_url:
                        return left_url + "%26" + connective + "%26" + right_url
                    # The failing side has already printed "Invalid query.";
                    # do not hand back a half-built URL.
                    return ""

        print("Invalid query.")
        return ""
    
    
    def parse_query_to_json(pair):
        """
        Convert a single ``A.B: ...`` query into a filter dictionary.

        The part after the colon is split into whitespace-separated tokens,
        except that a double-quoted run is kept together as one token.

        * ``A.B: C``      -> ``{"A.B": {"$regex": "C", "$options": "i"}}``
        * ``A.B: "C"``    -> ``{"A.B": '"C"'}``
        * ``A.B: NOT C``  -> ``{"A.B": {"$not": {"$regex": "C", "$options": "i"}}}``
        * ``A.B: > N``    -> ``{"A.B": {"$gt": float(N)}}``
        * ``A.B: < N``    -> ``{"A.B": {"$lt": float(N)}}``

        :param pair: query text of the form ``A.B: value`` or ``A.B: OP value``
        :return: the filter dictionary, or ``{"wrong": "True"}`` (after printing
                 a "Failed to parse query: ..." message) when the text cannot
                 be parsed
        """
        parsed = re.search(
            r"^\s*([0-9A-Za-z_]+)\.([0-9A-Za-z_]+)\s*:(.*)$", pair, re.DOTALL
        )
        # Tokens after the colon: a quoted run or a run of non-space characters.
        tokens = re.findall(r'"[^"]*"|\S+', parsed.group(3)) if parsed else []

        if len(tokens) not in (1, 2):
            print("Failed to parse query: invalid args number")
            return {"wrong": "True"}            # will never be found in database

        key = parsed.group(1) + "." + parsed.group(2)

        if len(tokens) == 1:
            # can be A.B: C or A.B: "C"
            operand = tokens[0]
            if re.search(r'".*"', operand):
                # NOTE(review): the surrounding quotes are kept in the returned
                # value, as in the original code - confirm whether they should
                # be stripped before the value is used for matching.
                return {key: operand}
            return {key: {"$regex": operand, "$options": "i"}}

        # can be NOT, >, or <
        operator, operand = tokens
        if operator == "NOT":
            return {key: {"$not": {"$regex": operand, "$options": "i"}}}
        if operator in (">", "<"):
            try:
                number = float(operand)
            except ValueError:
                print("Failed to parse query: invalid number")
                return {"wrong": "True"}
            return {key: {"$gt" if operator == ">" else "$lt": number}}

        print("Failed to parse query: unknown operator")
        return {"wrong": "True"}
    
    
    def url_safe(element):
        """
        Percent-encode the double quote, ``>`` and ``<`` characters of a string.

        All other characters, including existing ``%XX`` sequences, are left
        untouched.

        :param element: text to escape
        :return: copy of ``element`` with ``"`` replaced by ``%22``,
                 ``>`` by ``%3E`` and ``<`` by ``%3C``
        """
        escapes = str.maketrans({'"': "%22", ">": "%3E", "<": "%3C"})
        return element.translate(escapes)