package
com.hive.myudf;
import
java.net.URL;
import
java.util.regex.Matcher;
import
java.util.regex.Pattern;
import
org.apache.hadoop.hive.ql.exec.UDF;
import
org.apache.hadoop.io.Text;
import
org.apache.log4j.Logger;
/**
 * Hive UDF that rebuilds a full URL from an nginx access-log host plus the raw
 * request field ("METHOD URI PROTOCOL") and extracts a component of it.
 *
 * <p>Return contract (sentinels kept for compatibility with existing queries):
 * <ul>
 *   <li>{@code "a"} — any argument was NULL;</li>
 *   <li>{@code "b"} — the reconstructed URL was malformed;</li>
 *   <li>the host name — when {@code partToExtract} is {@code "HOST"};</li>
 *   <li>the literal {@code "rt"} — any other (unsupported) part.</li>
 * </ul>
 */
public class UDFNginxParseUrl extends UDF {

    private static final Logger LOG = Logger.getLogger(UDFNginxParseUrl.class);

    /** Scheme prepended to the host when rebuilding the URL. */
    private static final String SCHEME = "http://";

    /**
     * Nginx request field "METHOD URI PROTOCOL"; group(2) is the URI.
     * Compiled once instead of on every row (the original recompiled per call).
     */
    private static final Pattern REQUEST_PATTERN = Pattern.compile("(.+?) +(.+?) (.+)");

    public UDFNginxParseUrl() {
    }

    /**
     * Extracts a part of the URL built as {@code "http://" + host + uri}.
     *
     * @param host1         host name taken from the log line
     * @param urlStr        raw nginx request field, e.g. {@code "GET /index.html HTTP/1.1"}
     * @param partToExtract part to extract; only {@code "HOST"} is supported
     * @return the requested part, or a sentinel ({@code "a"} for null input,
     *         {@code "b"} for a malformed URL, {@code "rt"} for unsupported parts)
     */
    public Text evaluate(Text host1, Text urlStr, Text partToExtract) {
        LOG.debug("3args|args1:" + host1 + ",args2:" + urlStr + ",args3:" + partToExtract);

        if (host1 == null || urlStr == null || partToExtract == null) {
            return new Text("a");
        }

        // BUG FIX: url was an instance field, so a row whose request field did
        // not match the pattern silently reused the URL parsed from a previous
        // row. It is now a local, reset for every invocation.
        URL url = null;
        Matcher m = REQUEST_PATTERN.matcher(urlStr.toString());
        if (m.matches()) {
            String realUrl = SCHEME + host1.toString() + m.group(2);
            LOG.debug("realurl:" + realUrl);
            try {
                url = new URL(realUrl);
            } catch (Exception e) {
                // Log the cause instead of swallowing it, then keep the
                // original "b" sentinel for malformed URLs.
                LOG.debug("malformed URL: " + realUrl, e);
                return new Text("b");
            }
        }

        // BUG FIX: the original called Text.equals("HOST") (a Text compared to
        // a String — always false) and then returned the literal "rt" instead
        // of the computed value, so the host was never returned.
        if (url != null && "HOST".equals(partToExtract.toString())) {
            String rt = url.getHost();
            LOG.debug("get host" + rt);
            return new Text(rt);
        }

        LOG.debug("get what");
        // Fallback kept identical to the original for unsupported parts.
        return new Text("rt");
    }
}