Recently I was looking for a quick way to explore some apache log file. I didn't want to set up any software and I wanted to analyze some very precise path for a specific user, or what happen after a specific error. So I thought about OpenRefine and its parsing capabilities.
The recipe doesn't replace an analytical tool to understand your traffic but help to go behind the curtain and drill down to analyze specific IP address or user, type of error code and patterns
First download your log file on your machine and create a new project. You will need to parse the file as Line-based text file.
Once we have our project load up, we need to parse the different information in their field. Apache log comes as one line per log. The Apache documentation will help us to find the different part of the log.
All the split are done using the option Edit column > Split into several columns ...
The steps 1 to 8 are summarized in the following recipe. You can copy the code provided below and reapply it to your project to gain time (under Undo/Redo > Apply)
Now you can use the filter text search function to explore your log. I will recommand the reading of this article: Use Refine to explore and profile your data (facet, filter, flag)
The recipe doesn't replace an analytical tool to understand your traffic but help to go behind the curtain and drill down to analyze specific IP address or user, type of error code and patterns
First download your log file on your machine and create a new project. You will need to parse the file as Line-based text file.
Once we have our project load up, we need to parse the different information in their field. Apache log comes as one line per log. The Apache documentation will help us to find the different part of the log.
All the split are done using the option Edit column > Split into several columns ...
- Split the Column 1 on a dash (-) to separate the IP address with a max of two columns
- Split the Column 1 2 on square left bracket (]) to separate the username and the address
- Split the Column 1 2 1 on square right bracket ([) to separate the username and the timestamp
- Split the Column 1 2 2 on the dash (-) to generate all the six other columns
- Split the Column 1 2 2 3 on a space to separate the answer time from the object size
- Using the re-order column option removes all unnecessary columns
- Rename all the column one by one using the option Edit column > Rename this column
- Convert the timetamp field into a time format. In this case we manually replace each month name by its number before changing the format toDate() and defining the date format.
value.replace('Jan','01').replace('Feb','02').replace('Mar','03').replace('Apr','04').replace('May','05').replace('Jun','06').replace('Jui','07').replace('Aug','08').replace('Sep','09').replace('Oct','10').replace('Nov','11').replace('Dec','12').toDate('dd/MM/yyyy:hh:mm:ss') - You can further parse the User-Agent HTTP request header field to extract each information.
The steps 1 to 8 are summarized in the following recipe. You can copy the code provided below and reapply it to your project to gain time (under Undo/Redo > Apply)
Now you can use the filter text search function to explore your log. I will recommand the reading of this article: Use Refine to explore and profile your data (facet, filter, flag)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"op": "core/column-split", | |
"description": "Split column Column 1 by separator", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "Column 1", | |
"guessCellType": true, | |
"removeOriginalColumn": true, | |
"mode": "separator", | |
"separator": "-", | |
"regex": false, | |
"maxColumns": 2 | |
}, | |
{ | |
"op": "core/column-split", | |
"description": "Split column Column 1 2 by separator", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "Column 1 2", | |
"guessCellType": true, | |
"removeOriginalColumn": true, | |
"mode": "separator", | |
"separator": "]", | |
"regex": false, | |
"maxColumns": 2 | |
}, | |
{ | |
"op": "core/column-split", | |
"description": "Split column Column 1 2 1 by separator", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "Column 1 2 1", | |
"guessCellType": true, | |
"removeOriginalColumn": true, | |
"mode": "separator", | |
"separator": "[", | |
"regex": false, | |
"maxColumns": 2 | |
}, | |
{ | |
"op": "core/column-split", | |
"description": "Split column Column 1 2 2 by separator", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "Column 1 2 2", | |
"guessCellType": true, | |
"removeOriginalColumn": true, | |
"mode": "separator", | |
"separator": "\"", | |
"regex": false, | |
"maxColumns": 0 | |
}, | |
{ | |
"op": "core/column-split", | |
"description": "Split column Column 1 2 2 3 by separator", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "Column 1 2 2 3", | |
"guessCellType": true, | |
"removeOriginalColumn": true, | |
"mode": "separator", | |
"separator": " ", | |
"regex": false, | |
"maxColumns": 0 | |
}, | |
{ | |
"op": "core/column-reorder", | |
"description": "Reorder columns", | |
"columnNames": [ | |
"Column 1 1", | |
"Column 1 2 1 1", | |
"Column 1 2 1 2", | |
"Column 1 2 2 2", | |
"Column 1 2 2 3 2", | |
"Column 1 2 2 3 3", | |
"Column 1 2 2 4", | |
"Column 1 2 2 6" | |
] | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 1 to client IP address", | |
"oldColumnName": "Column 1 1", | |
"newColumnName": " client IP address" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 1 1 to htaccess userid", | |
"oldColumnName": "Column 1 2 1 1", | |
"newColumnName": "htaccess userid" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 1 2 to timestamp", | |
"oldColumnName": "Column 1 2 1 2", | |
"newColumnName": "timestamp" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 2 2 to request line", | |
"oldColumnName": "Column 1 2 2 2", | |
"newColumnName": "request line" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 2 3 3 to object size", | |
"oldColumnName": "Column 1 2 2 3 3", | |
"newColumnName": "object size" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 2 4 to HTTP request header", | |
"oldColumnName": "Column 1 2 2 4", | |
"newColumnName": "HTTP request header" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 2 3 2 to Status Code", | |
"oldColumnName": "Column 1 2 2 3 2", | |
"newColumnName": "Status Code" | |
}, | |
{ | |
"op": "core/column-rename", | |
"description": "Rename column Column 1 2 2 6 to User-Agent HTTP request header", | |
"oldColumnName": "Column 1 2 2 6", | |
"newColumnName": "User-Agent HTTP request header" | |
}, | |
{ | |
"op": "core/text-transform", | |
"description": "Text transform on cells in column timestamp using expression grel:value.replace('Jan','01').replace('Feb','02').replace('Mar','03').replace('Apr','04').replace('May','05').replace('Jun','06').replace('Jui','07').replace('Aug','08').replace('Sep','09').replace('Oct','10').replace('Nov','11').replace('Dec','12').toDate('dd/MM/yyyy:hh:mm:ss')", | |
"engineConfig": { | |
"facets": [], | |
"mode": "row-based" | |
}, | |
"columnName": "timestamp", | |
"expression": "grel:value.replace('Jan','01').replace('Feb','02').replace('Mar','03').replace('Apr','04').replace('May','05').replace('Jun','06').replace('Jui','07').replace('Aug','08').replace('Sep','09').replace('Oct','10').replace('Nov','11').replace('Dec','12').toDate('dd/MM/yyyy:hh:mm:ss')", | |
"onError": "keep-original", | |
"repeat": false, | |
"repeatCount": 10 | |
} | |
] |