diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index fc13845a7f6c23355928e65d4a7753fed60df0c9..a8c23a8b0c536dbc893c9d849230b33d35777c11 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import java.net.{MalformedURLException, URL} +import java.net.{URI, URISyntaxException} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{HashMap, Locale, Map => JMap} import java.util.regex.Pattern @@ -749,25 +749,44 @@ case class ParseUrl(children: Seq[Expression]) Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX) } - private def getUrl(url: UTF8String): URL = { + private def getUrl(url: UTF8String): URI = { try { - new URL(url.toString) + new URI(url.toString) } catch { - case e: MalformedURLException => null + case e: URISyntaxException => null } } - private def getExtractPartFunc(partToExtract: UTF8String): URL => String = { + private def getExtractPartFunc(partToExtract: UTF8String): URI => String = { + + // partToExtract match { + // case HOST => _.toURL().getHost + // case PATH => _.toURL().getPath + // case QUERY => _.toURL().getQuery + // case REF => _.toURL().getRef + // case PROTOCOL => _.toURL().getProtocol + // case FILE => _.toURL().getFile + // case AUTHORITY => _.toURL().getAuthority + // case USERINFO => _.toURL().getUserInfo + // case _ => (url: URI) => null + // } + partToExtract match { case HOST => _.getHost - case PATH => _.getPath - case QUERY => _.getQuery - case REF => _.getRef - case PROTOCOL => _.getProtocol - case FILE => _.getFile - case AUTHORITY => _.getAuthority - case USERINFO => _.getUserInfo - case _ => (url: URL) => null + case PATH => _.getRawPath + case QUERY => _.getRawQuery + case REF => _.getRawFragment + case PROTOCOL => _.getScheme + case FILE => + (url: URI) => + if (url.getRawQuery ne null) { + url.getRawPath + "?" + url.getRawQuery + } else { + url.getRawPath + } + case AUTHORITY => _.getRawAuthority + case USERINFO => _.getRawUserInfo + case _ => (url: URI) => null } } @@ -780,7 +799,7 @@ case class ParseUrl(children: Seq[Expression]) } } - private def extractFromUrl(url: URL, partToExtract: UTF8String): UTF8String = { + private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = { if (cachedExtractPartFunc ne null) { UTF8String.fromString(cachedExtractPartFunc.apply(url)) } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 524926e1e9b66830da87395705c587ed2e04887f..57ca5d9c4d7cf19c8b04be339cbbe5f44701222e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -229,18 +229,48 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { } test("string parse_url function") { - val df = Seq[String](("http://userinfo@spark.apache.org/path?query=1#Ref")) - .toDF("url") - checkAnswer( - df.selectExpr( + def testUrl(url: String, expected: Row) { + checkAnswer(Seq[String]((url)).toDF("url").selectExpr( "parse_url(url, 'HOST')", "parse_url(url, 'PATH')", "parse_url(url, 'QUERY')", "parse_url(url, 'REF')", "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')", "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')", - "parse_url(url, 'QUERY', 'query')"), + "parse_url(url, 'QUERY', 'query')"), expected) + } + + testUrl( + "http://userinfo@spark.apache.org/path?query=1#Ref", Row("spark.apache.org", "/path", "query=1", "Ref", "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1")) + + testUrl( + "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two", + Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two", + "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com", + "use%20r:pas%20s", "x%20y")) + + testUrl( + "http://user:pass@host", + Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/", + Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/?#", + Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null)) + + testUrl( + "http://user:pass@host/file;param?query;p2", + Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2", + "user:pass@host", "user:pass", null)) + + testUrl( + "inva lid://user:pass@host/file;param?query;p2", + Row(null, null, null, null, null, null, null, null, null)) + } test("string repeat function") {