Skip to content
Snippets Groups Projects
Parser.py 3.22 KiB
Newer Older
  • Learn to ignore specific revisions
  • import re
    
    
    def check_if_address_valid(address):
        """
        Check whether an address points at a Goodreads book page.

        The address is accepted when it starts with
        ``https://www.goodreads.com/book/show/`` followed by at least one digit.
        Anything after the numeric id (for example a title slug) is ignored.

        :param address: URL string to validate
        :return: True if the address has the expected prefix, False otherwise
                 (including when ``address`` is not a string)
        """
        if not isinstance(address, str):
            return False
        # The dots are escaped so they only match a literal "." in the host name;
        # unescaped, "." would accept any character in that position.
        pattern = r"https://www\.goodreads\.com/book/show/[0-9]+"
        return re.match(pattern, address) is not None
    
    
    def parse_query_to_url(query):
        """
        Translate a search query into its URL-encoded form.

        Supported query shapes (``A.B`` is a field name, ``C`` a value):

        * ``A.B:C``                  -> ``A.B%3AC``
        * ``A.B: NOT C``             -> ``A.B%3ANOTC``
        * ``A.B: > C`` / ``A.B: < C`` -> ``A.B%3A%3EC`` / ``A.B%3A%3CC``
        * two of the shapes above joined by ``AND`` or ``OR`` (5 to 7 tokens),
          encoded as ``<left>%26AND%26<right>`` / ``<left>%26OR%26<right>``

        A message is printed when the query cannot be parsed.

        :param query: raw query string
        :return: the encoded query, or "" when the query (or either half of an
                 AND/OR query) is invalid
        """
        elements = re.findall(r'[0-9A-Za-z_"><.]+', query)
        count = len(elements)

        if count == 2:
            # can only be A.B:C or wrong
            if re.search(r'^[0-9a-zA-Z_.]+:[0-9a-zA-Z_".]', query):
                return url_safe(elements[0] + "%3A" + elements[1])
            print("Invalid query1.")
            return ""

        if count == 3:
            # a pair and one of [NOT, >, <], tried in that order
            for operator in ("NOT", ">", "<"):
                pattern = r"^[0-9a-zA-Z_.]+:\s*" + operator + r'\s*[0-9a-zA-Z_".]'
                if re.search(pattern, query):
                    return url_safe(elements[0] + "%3A" + operator + elements[2])
            print("Invalid query2.")
            return ""

        if 5 <= count <= 7:
            # AND or OR operator; AND takes precedence when both are present
            for keyword in ("AND", "OR"):
                # Split on the same whitespace-delimited keyword that is being
                # detected, so a match always yields exactly two halves.
                parts = re.split(r"\s" + keyword + r"\s", query, maxsplit=1)
                if len(parts) == 2:
                    left = parse_query_to_url(parts[0])
                    right = parse_query_to_url(parts[1])
                    if not left or not right:
                        # An invalid half has already been reported by the
                        # recursive call; do not emit a half-built URL.
                        return ""
                    return left + "%26" + keyword + "%26" + right
            print("Invalid query3.")
            return ""

        # Any other token count cannot form a valid query.
        print("Invalid query.")
        return ""
    
    
    def parse_query_to_json(pair):
        """
        Convert a single query expression into a database filter dict.

        The expression is tokenised into runs of letters, digits, ``"``, ``_``
        and ``.``. The first token is the field, written ``A.B``; only the part
        between the first and second "." (``B``) is used as the key.

        * three tokens (``A.B: NOT C``): ``{B: {"$ne": float(C)}}`` for a numeric
          ``C`` on a non-"id" field, otherwise a negated case-insensitive regex.
          The middle token itself is not inspected.
        * two tokens with ":" in the text (``A.B: C`` or ``A.B: "C"``): a float
          for a numeric ``C`` on a non-"id" field, the token as-is when it is
          double-quoted, otherwise a case-insensitive regex.
        * two tokens without ":" : ``$gt`` when the text contains ">", ``$lt``
          when it contains "<".

        :param pair: query text such as ``book.rating: 4`` or ``book.rating > 4``
        :return: the filter dict, or ``{"wrong": "True"}`` when parsing fails
        """
        invalid = {"wrong": "True"}            # will never be found in database

        elements = re.findall(r'[0-9A-Za-z"_.]+', pair)
        count = len(elements)
        if count not in (2, 3):
            print("Failed to parse query: invalid args number")
            return invalid

        name_parts = elements[0].split(".")
        if len(name_parts) < 2:
            # Without a "." there is no attribute name to use as the key.
            print("Failed to parse query: invalid field name")
            return invalid
        field = name_parts[1]
        value = elements[-1]

        # Values made only of digits and dots are numeric candidates; text such
        # as "1.2.3" passes the pattern but is not a float, so it stays textual.
        number = None
        if re.search(r"^[0-9.]*$", value):
            try:
                number = float(value)
            except ValueError:
                number = None
        # Fields whose name contains "id" are never compared as numbers.
        use_number = number is not None and "id" not in elements[0]

        if count == 3:
            # A.B: NOT C
            if use_number:
                return {field: {"$ne": number}}
            return {field: {"$not": {"$regex": value, "$options": "i"}}}

        if ":" in pair:
            # can be A.B: C or A.B: "C"
            if use_number:
                return {field: number}
            if re.search(r'".*"', value):
                # NOTE(review): the surrounding double quotes are kept in the
                # returned value - confirm the consumer expects that.
                return {field: value}
            return {field: {"$regex": value, "$options": "i"}}

        if ">" in pair:
            operator = "$gt"
        elif "<" in pair:
            operator = "$lt"
        else:
            print("Failed to parse query: unknown operator")
            return invalid

        try:
            return {field: {operator: float(value)}}
        except ValueError:
            print("Failed to parse query: comparison value is not a number")
            return invalid
    
    
    
    def url_safe(element):
        """
        Percent-encode the characters that are special in a query URL.

        Replaces ``"``, ``>``, ``<``, ``&`` and ``:`` with ``%22``, ``%3E``,
        ``%3C``, ``%26`` and ``%3A``. ``%`` itself is left untouched, so text
        that already contains percent-encoded sequences is not encoded twice.

        :param element: text to encode
        :return: the encoded text
        """
        escapes = str.maketrans({
            '"': "%22",
            ">": "%3E",
            "<": "%3C",
            "&": "%26",
            ":": "%3A",
        })
        # A single translate pass replaces the chain of str.replace calls.
        return element.translate(escapes)