From 159b29b80b801ae0ab58bdd189a84858afd87ac5 Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 15:56:32 -0700 Subject: [PATCH 1/8] Replace spaces with tabs to make indentation consistent within the file. --- gravity.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gravity.sh b/gravity.sh index f1e70a63..e2126693 100755 --- a/gravity.sh +++ b/gravity.sh @@ -80,11 +80,11 @@ find $origin/ -type f -name "*.$justDomainsExtension" -exec cat {} \; | tr -d '\ # Append blacklist entries if they exist if [[ -f $blacklist ]];then - numberOf=$(cat $blacklist | sed '/^\s*$/d' | wc -l) - echo "** Blacklisting $numberOf domain(s)..." - cat $blacklist >> $origin/$matter + numberOf=$(cat $blacklist | sed '/^\s*$/d' | wc -l) + echo "** Blacklisting $numberOf domain(s)..." + cat $blacklist >> $origin/$matter else - : + : fi function gravity_advanced() From 0ec6eab683c02254ca32331c8aeb532e54097ee7 Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 16:04:54 -0700 Subject: [PATCH 2/8] Appending ".$justDomainsExtension" to $saveLocation variable. Every use of $saveLocation was adding this and making lines longer. 
--- gravity.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gravity.sh b/gravity.sh index e2126693..058e72e7 100755 --- a/gravity.sh +++ b/gravity.sh @@ -51,15 +51,15 @@ do domain=$(echo "${sources[$i]}" | cut -d'/' -f3) # Save the file as list.#.domain - saveLocation=$origin/"list"."$i"."$domain" + saveLocation=$origin/list.$i.$domain.$justDomainsExtension # Use a case statement to download lists that need special cURL commands to complete properly case "$domain" in - "adblock.mahakala.is") data=$(curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' -e http://forum.xda-developers.com/ -z $saveLocation."$justDomainsExtension" "${sources[$i]}");; + "adblock.mahakala.is") data=$(curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' -e http://forum.xda-developers.com/ -z $saveLocation "${sources[$i]}");; - "pgl.yoyo.org") data=$(curl -s -d mimetype=plaintext -d hostformat=hosts -z $saveLocation."$justDomainsExtension" "${sources[$i]}");; + "pgl.yoyo.org") data=$(curl -s -d mimetype=plaintext -d hostformat=hosts -z $saveLocation "${sources[$i]}");; - *) data=$(curl -s -z $saveLocation."$justDomainsExtension" -A "Mozilla/10.0" "${sources[$i]}");; + *) data=$(curl -s -z $saveLocation -A "Mozilla/10.0" "${sources[$i]}");; esac if [[ -n "$data" ]];then @@ -68,7 +68,7 @@ do # Most of the lists downloaded are already in hosts file format but the spacing/formating is not contigious # This helps with that and makes it easier to read # It also helps with debugging so each stage of the script can be researched more in depth - echo "$data" | awk 'NF {if ($1 !~ "#") { if (NF>1) {print $2} else {print $1}}}' > $saveLocation."$justDomainsExtension" + echo "$data" | awk 'NF {if ($1 !~ "#") { if (NF>1) {print $2} else {print $1}}}' > $saveLocation else echo "Skipping $domain list because it does not have any new entries..." 
fi From d6d192cb0a1a2adc9f2be31388d4e47c2f196728 Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 16:22:07 -0700 Subject: [PATCH 3/8] Use `url` variable to store `${sources[$i]}` value to improve readability. I also wanted to replace the for loop iterating over indices with something like: `for url in ${sources[@]}` It made the use of `$i` in the save location more annoying though. --- gravity.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gravity.sh b/gravity.sh index 058e72e7..934581e8 100755 --- a/gravity.sh +++ b/gravity.sh @@ -47,19 +47,20 @@ fi # Loop through domain list. Download each one and remove commented lines (lines beginning with '# 'or '/') and blank lines for ((i = 0; i < "${#sources[@]}"; i++)) do + url=${sources[$i]} # Get just the domain from the URL - domain=$(echo "${sources[$i]}" | cut -d'/' -f3) + domain=$(echo "$url" | cut -d'/' -f3) # Save the file as list.#.domain saveLocation=$origin/list.$i.$domain.$justDomainsExtension # Use a case statement to download lists that need special cURL commands to complete properly case "$domain" in - "adblock.mahakala.is") data=$(curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' -e http://forum.xda-developers.com/ -z $saveLocation "${sources[$i]}");; + "adblock.mahakala.is") data=$(curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' -e http://forum.xda-developers.com/ -z $saveLocation $url);; - "pgl.yoyo.org") data=$(curl -s -d mimetype=plaintext -d hostformat=hosts -z $saveLocation "${sources[$i]}");; + "pgl.yoyo.org") data=$(curl -s -d mimetype=plaintext -d hostformat=hosts -z $saveLocation $url);; - *) data=$(curl -s -z $saveLocation -A "Mozilla/10.0" "${sources[$i]}");; + *) data=$(curl -s -z $saveLocation -A "Mozilla/10.0" $url);; esac if [[ -n "$data" ]];then From 1f29d01694a293685feacd7abb4a2639e5cbcc6f Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 17:05:19 -0700 Subject: 
[PATCH 4/8] Remove leading and trailing whitespace and `.` characters and duplicate `.` characters as each list is stored. Should fix #32. --- gravity.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gravity.sh b/gravity.sh index 934581e8..3dc9b8df 100755 --- a/gravity.sh +++ b/gravity.sh @@ -69,7 +69,8 @@ do # Most of the lists downloaded are already in hosts file format but the spacing/formating is not contigious # This helps with that and makes it easier to read # It also helps with debugging so each stage of the script can be researched more in depth - echo "$data" | awk 'NF {if ($1 !~ "#") { if (NF>1) {print $2} else {print $1}}}' > $saveLocation + echo "$data" | awk 'NF {if ($1 !~ "#") { if (NF>1) {print $2} else {print $1}}}' | \ + sed -e 's/^[. \t]*//' -e 's/\.\.\+/./g' -e 's/[. \t]*$//' > $saveLocation else echo "Skipping $domain list because it does not have any new entries..." fi From bb7db1121465ede2909e3ee79244ad884a3cb372 Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 17:33:30 -0700 Subject: [PATCH 5/8] Changing printouts when updating sources to tell what is going on when manually running gravity.sh This will print "Getting $domain list... " for each domain, followed by either "Done" if data was received and validated, or "Skipping list because it does not have any new entries" if no updates were needed. --- gravity.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gravity.sh b/gravity.sh index 3dc9b8df..c6cd6b29 100755 --- a/gravity.sh +++ b/gravity.sh @@ -54,6 +54,7 @@ do # Save the file as list.#.domain saveLocation=$origin/list.$i.$domain.$justDomainsExtension + echo -n "Getting $domain list... 
" # Use a case statement to download lists that need special cURL commands to complete properly case "$domain" in "adblock.mahakala.is") data=$(curl -s -A 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' -e http://forum.xda-developers.com/ -z $saveLocation $url);; @@ -64,15 +65,15 @@ do esac if [[ -n "$data" ]];then - echo "Getting $domain list..." # Remove comments and print only the domain name # Most of the lists downloaded are already in hosts file format but the spacing/formating is not contigious # This helps with that and makes it easier to read # It also helps with debugging so each stage of the script can be researched more in depth echo "$data" | awk 'NF {if ($1 !~ "#") { if (NF>1) {print $2} else {print $1}}}' | \ sed -e 's/^[. \t]*//' -e 's/\.\.\+/./g' -e 's/[. \t]*$//' > $saveLocation + echo "Done." else - echo "Skipping $domain list because it does not have any new entries..." + echo "Skipping list because it does not have any new entries." fi done From e464c04490c4f3e336a26af09a9794ed1b9bd70a Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 17:47:22 -0700 Subject: [PATCH 6/8] Ignore domains in ad lists that do not contain `.` characters. This will skip entries such as `localhost`, `android`, `debian` and empty lines as listed in #35. --- gravity.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gravity.sh b/gravity.sh index c6cd6b29..40f87ccf 100755 --- a/gravity.sh +++ b/gravity.sh @@ -70,7 +70,7 @@ do # This helps with that and makes it easier to read # It also helps with debugging so each stage of the script can be researched more in depth echo "$data" | awk 'NF {if ($1 !~ "#") { if (NF>1) {print $2} else {print $1}}}' | \ - sed -e 's/^[. \t]*//' -e 's/\.\.\+/./g' -e 's/[. \t]*$//' > $saveLocation + sed -e 's/^[. \t]*//' -e 's/\.\.\+/./g' -e 's/[. \t]*$//' | grep "\." > $saveLocation echo "Done." else echo "Skipping list because it does not have any new entries." 
From a26377d2298fabb88aedd331b0fcabfac8066ef6 Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 21:44:41 -0700 Subject: [PATCH 7/8] Append ad list sources to latentWhitelist.txt to prevent them from being filtered. Additional fixes for #35. This will prevent our own sources from being filtered out by competing source lists. --- gravity.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/gravity.sh b/gravity.sh index 40f87ccf..8a6f22da 100755 --- a/gravity.sh +++ b/gravity.sh @@ -113,13 +113,21 @@ function gravity_advanced() if [[ -f $whitelist ]];then # Remove whitelist entries numberOf=$(cat $whitelist | sed '/^\s*$/d' | wc -l) - echo "** Whitelisting $numberOf domain(s)..." + plural=; [[ "$numberOf" != "1" ]] && plural=s + echo "** Whitelisting $numberOf domain${plural}..." # Append a "$" to the end of each line so it can be parsed out with grep -w - echo -n "^$" > $latentWhitelist awk -F '[# \t]' 'NF>0&&$1!="" {print $1"$"}' $whitelist > $latentWhitelist - cat $origin/$matter | grep -vwf $latentWhitelist > $origin/$andLight - gravity_advanced else - cat $origin/$matter > $origin/$andLight - gravity_advanced + rm $latentWhitelist fi + +# Prevent our sources from being pulled into the hole +plural=; [[ "${#sources[@]}" != "1" ]] && plural=s +echo "** Whitelisting ${#sources[@]} ad list source${plural}..." +for url in ${sources[@]} +do + echo "$url" | awk -F '/' '{print $3"$"}' >> $latentWhitelist +done +grep -vwf $latentWhitelist $origin/$matter > $origin/$andLight + +gravity_advanced From 98c94912e185babc6ba6e949c5fa1f30b5a2c1d2 Mon Sep 17 00:00:00 2001 From: korhadris Date: Sat, 22 Aug 2015 23:37:01 -0700 Subject: [PATCH 8/8] Replace use of grep -w with grep -x. Prepend "^" to start of latentWhitelist.txt lines. The -x switch requires a full line match of the regexp, whereas -w will try to find the match somewhere in the line, looking for word breaks. 
Combined with turning the whitelist lines into full regexps, this results in significantly faster parsing. Having "^" prepended to the lines also keeps false whitelisting from occurring, such as the following example: If whitelist.txt contains "google.com" it would whitelist many other sites that end in "google.com" as long as there is a non-word character preceding the google (such as "-" or "."). --- gravity.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gravity.sh b/gravity.sh index 8a6f22da..9bad747d 100755 --- a/gravity.sh +++ b/gravity.sh @@ -115,8 +115,10 @@ if [[ -f $whitelist ]];then numberOf=$(cat $whitelist | sed '/^\s*$/d' | wc -l) plural=; [[ "$numberOf" != "1" ]] && plural=s echo "** Whitelisting $numberOf domain${plural}..." - # Append a "$" to the end of each line so it can be parsed out with grep -w - awk -F '[# \t]' 'NF>0&&$1!="" {print $1"$"}' $whitelist > $latentWhitelist + # Append a "$" to the end, prepend a "^" to the beginning, and + # replace "." with "\." of each line to turn each entry into a + # regexp so it can be parsed out with grep -x + awk -F '[# \t]' 'NF>0&&$1!="" {print "^"$1"$"}' $whitelist | sed 's/\./\\./g' > $latentWhitelist else rm $latentWhitelist fi @@ -126,8 +128,9 @@ plural=; [[ "${#sources[@]}" != "1" ]] && plural=s echo "** Whitelisting ${#sources[@]} ad list source${plural}..." for url in ${sources[@]} do - echo "$url" | awk -F '/' '{print $3"$"}' >> $latentWhitelist + echo "$url" | awk -F '/' '{print "^"$3"$"}' | sed 's/\./\\./g' >> $latentWhitelist done -grep -vwf $latentWhitelist $origin/$matter > $origin/$andLight + +grep -vxf $latentWhitelist $origin/$matter > $origin/$andLight gravity_advanced