diff --git a/build.sbt b/build.sbt index 1799d84..7f8a542 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ name := "ScalaApacheAccessLogParser" version := "1.0" -scalaVersion := "2.10.0" +scalaVersion := "2.10.4" resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" diff --git a/src/main/scala/AccessLogParser.scala b/src/main/scala/AccessLogParser.scala index 473082e..11f123b 100644 --- a/src/main/scala/AccessLogParser.scala +++ b/src/main/scala/AccessLogParser.scala @@ -17,9 +17,23 @@ import scala.util.{Try, Success, Failure} * */ + /** + * For a record that does not start with a botnet name, such as: + * 94.102.63.11 - - [21/Jul/2009:02:48:13 -0700] "GET / HTTP/1.1" 200 18209 "http://acme.com/foo.php" "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)" + * the parser prepends '- ' to the log line before matching, so the botnet field is '-'. + * + * + * For a record that starts with a botnet name, such as: + * Expiro 5.102.63.11 - - [3/Jan/2014:10:06:55 +0000] "GET /?f=x HTTP/1.1" 200 3594 "http://www.foo.it/foo.php" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E; InfoPath.2)" + * the botnet field is 'Expiro'. + */ + + + @SerialVersionUID(100L) class AccessLogParser extends Serializable { + private val bot = "(\\S+)" // like 'Expiro' private val ddd = "\\d{1,3}" // at least 1 but not more than 3 times (possessive) private val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?" 
// like `123.456.7.89` private val client = "(\\S+)" // '\S' is 'non-whitespace character' @@ -30,7 +44,7 @@ class AccessLogParser extends Serializable { private val bytes = "(\\S+)" // this can be a "-" private val referer = "\"(.*?)\"" private val agent = "\"(.*?)\"" - private val regex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent" + private val regex = s"$bot $ip $client $user $dateTime $request $status $bytes $referer $agent" private val p = Pattern.compile(regex) /** @@ -39,7 +53,10 @@ class AccessLogParser extends Serializable { * @return An AccessLogRecord instance wrapped in an Option. */ def parseRecord(record: String): Option[AccessLogRecord] = { - val matcher = p.matcher(record) + val isbot = "(\\D+)".r + val check_record = isbot findFirstIn record + val matcher = if(check_record == Some(".")) p.matcher("- " + record) + else p.matcher(record) if (matcher.find) { Some(buildAccessLogRecord(matcher)) } else { @@ -57,7 +74,10 @@ class AccessLogParser extends Serializable { * will be empty strings. 
*/ def parseRecordReturningNullObjectOnFailure(record: String): AccessLogRecord = { - val matcher = p.matcher(record) + val isbot = "(\\D+)".r + val check_record = isbot findFirstIn record + val matcher = if(check_record == Some(".")) p.matcher("- " + record) + else p.matcher(record) if (matcher.find) { buildAccessLogRecord(matcher) } else { @@ -75,7 +95,8 @@ class AccessLogParser extends Serializable { matcher.group(6), matcher.group(7), matcher.group(8), - matcher.group(9)) + matcher.group(9), + matcher.group(10)) } } @@ -85,7 +106,7 @@ class AccessLogParser extends Serializable { */ object AccessLogParser { - val nullObjectAccessLogRecord = AccessLogRecord("", "", "", "", "", "", "", "", "") + val nullObjectAccessLogRecord = AccessLogRecord("", "", "", "", "", "", "", "", "", "") /** * @param A String like "GET /the-uri-here HTTP/1.1" diff --git a/src/main/scala/AccessLogRecord.scala b/src/main/scala/AccessLogRecord.scala index f592fa0..9715b5a 100644 --- a/src/main/scala/AccessLogRecord.scala +++ b/src/main/scala/AccessLogRecord.scala @@ -4,6 +4,7 @@ package com.alvinalexander.accesslogparser * @see http://httpd.apache.org/docs/2.2/logs.html for details */ case class AccessLogRecord ( + botnet: String, // string or empty clientIpAddress: String, // should be an ip address, but may also be the hostname if hostname-lookups are enabled rfc1413ClientIdentity: String, // typically `-` remoteUser: String, // typically `-` diff --git a/src/test/scala/AccessLogRecordSpec.scala b/src/test/scala/AccessLogRecordSpec.scala index 8d6f0e2..bca6893 100644 --- a/src/test/scala/AccessLogRecordSpec.scala +++ b/src/test/scala/AccessLogRecordSpec.scala @@ -20,6 +20,7 @@ class ApacheCombinedAccessLogRecordSpec extends FunSpec with BeforeAndAfter with val parser = new AccessLogParser val rec = parser.parseRecord(records(0)) println("IP ADDRESS: " + rec.get.clientIpAddress) + println("BOTNET: " + rec.get.botnet) Then("parsing record(0) should not return None") assert(rec != None) 
And("the ip address should be correct") @@ -42,6 +43,39 @@ class ApacheCombinedAccessLogRecordSpec extends FunSpec with BeforeAndAfter with assert(rec.get.userAgent == "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 GTB5") } } + + describe("Testing the access log record with botnet ...") { + it("the data fields should be correct") { + Given("the first sample log record") + records = SampleCombinedAccessLogRecords.botnetRecord + val parser = new AccessLogParser + val rec = parser.parseRecord(records(0)) + Then("parsing record(0) should not return None") + assert(rec != None) // checked before any rec.get so a failed parse fails this assertion instead of throwing NoSuchElementException + println("IP ADDRESS: " + rec.get.clientIpAddress) + println("BOTNET: " + rec.get.botnet) + And("botnet") + assert(rec.get.botnet == "Expiro") + And("the ip address should be correct") + assert(rec.get.clientIpAddress == "5.102.63.11") + And("client identity") + assert(rec.get.rfc1413ClientIdentity == "-") + And("remote user") + assert(rec.get.remoteUser == "-") + And("date/time") + assert(rec.get.dateTime == "[31/Jan/2014:10:06:55 +0000]") + And("request") + assert(rec.get.request == "GET /?f=x HTTP/1.1") + And("status code should be 200") + assert(rec.get.httpStatusCode == "200") + And("bytes sent should be 3594") + assert(rec.get.bytesSent == "3594") + And("referer") + assert(rec.get.referer == "http://www.foo.it/foo.php") + And("user agent") + assert(rec.get.userAgent == "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E; InfoPath.2)") + } + } describe("Testing a second access log record ...") { records = SampleCombinedAccessLogRecords.data diff --git a/src/test/scala/SampleData.scala b/src/test/scala/SampleData.scala index 20bfa97..c795e0d 100644 --- a/src/test/scala/SampleData.scala +++ b/src/test/scala/SampleData.scala @@ -16,6 +16,12 @@ object SampleCombinedAccessLogRecords { 66.249.70.10 - - [23/Feb/2014:03:21:59 -0700] "GET 
/blog/post/java/how-load-multiple-spring-context-files-standalone/ HTTP/1.0" 301 - "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" """.split("\n").filter(_ != "") + val botnetRecord = """ + Expiro 5.102.63.11 - - [31/Jan/2014:10:06:55 +0000] "GET /?f=x HTTP/1.1" 200 3594 "http://www.foo.it/foo.php" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E; InfoPath.2)" + Pandora 5.102.63.11 - - [31/Jan/2014:10:06:55 +0000] "GET /?f=x HTTP/1.1" 200 3594 "http://www.foo.it/foo.php" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E; InfoPath.2)" + FakeM 5.102.63.11 - - [31/Jan/2014:10:06:55 +0000] "GET /?f=x HTTP/1.1" 200 3594 "http://www.foo.it/foo.php" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E; InfoPath.2)" + Qakbot 5.102.63.11 - - [31/Jan/2014:10:06:55 +0000] "GET /?f=x HTTP/1.1" 200 3594 "http://www.foo.it/foo.php" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E; InfoPath.2)" + """.split("\n").filter(_ !="") }