diff --git a/.github/scripts/broken_link_commit_check.py b/.github/scripts/broken_link_commit_check.py
new file mode 100644
index 0000000..22cfbe9
--- /dev/null
+++ b/.github/scripts/broken_link_commit_check.py
@@ -0,0 +1,38 @@
+import requests
+import re
+from pathlib import Path
+
+# Simple regex to match URLs
+url_pattern = r'https?://[^\s"\']+'
+
+# Only scan text-based files
+TEXT_EXTENSIONS = [".js", ".html", ".md", ".txt"]
+files_to_scan = [f for f in Path(".").rglob("*.*") if f.suffix in TEXT_EXTENSIONS]
+
+broken_links = []
+
+def check_link(link, file_path):
+    try:
+        r = requests.get(link, timeout=5, allow_redirects=True)
+        if r.status_code == 404:
+            broken_links.append(f"{file_path}: {link}")
+    except Exception:
+        broken_links.append(f"{file_path}: {link} (error)")
+
+# Iterate through all selected files
+for file in files_to_scan:
+    try:
+        with open(file, "r", encoding="utf-8") as f:
+            for line in f:
+                for link in re.findall(url_pattern, line):
+                    check_link(link, file)
+    except Exception:
+        continue
+
+# Write broken links report to a temporary file
+with open("broken_links_report.txt", "w", encoding="utf-8") as f:
+    if broken_links:
+        for link in broken_links:
+            f.write(link + "\n")
+    else:
+        f.write("No broken links found.\n")
diff --git a/.github/workflows/blhawk-ci.yml b/.github/workflows/blhawk-ci.yml
new file mode 100644
index 0000000..2efe3e1
--- /dev/null
+++ b/.github/workflows/blhawk-ci.yml
@@ -0,0 +1,41 @@
+name: Broken Link Scan on PR
+
+on:
+  pull_request:
+    branches:
+      - test-action # only run on 'test-action' branch
+
+jobs:
+  broken-link-check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install requests
+
+      - name: Run Broken Link Scan
+        id: scan
+        run: |
+          python .github/scripts/broken_link_commit_check.py
+          echo "links_report<<EOF" >> $GITHUB_OUTPUT
+          cat broken_links_report.txt >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+
+      - name: Create or Update Comment
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          issue-number: ${{ github.event.pull_request.number }}
+          body: |
+            **Broken Link Scan Report**
+
+            ${{ steps.scan.outputs.links_report }}
+          edit-mode: replace
diff --git a/blhawk.py b/blhawk.py
index b7e191f..5c906dd 100644
--- a/blhawk.py
+++ b/blhawk.py
@@ -4,6 +4,7 @@
 def main():
     parser = argparse.ArgumentParser(prog='BLHawk', description='Dead links aren\'t always dead!', epilog='version: 0.3.0')
     parser.add_argument('-u', '--url', type=str, help='example: https://www.target.com')
+    parser.add_argument('--src', type=str, help='Path to source code directory for scan')
     #parser.add_argument('-l','--list', type=str, help='File containing URLs to check')
     #parser.add_argument('-t', '--thread', type=int, default=10, help='Number of threads to use (default: 10)')
     #parser.add_argument('-s', '--silent', help='show only result in output')
@@ -11,14 +12,20 @@
     args = parser.parse_args()
 
     try:
-        inputLoader(
+        if args.url:
+            inputLoader(
                 url=args.url,
                 #raw_request=args.list,
                 #cookie=args.filename,
                 #thread=args.thread,
                 #silent=args.silent,
             )
-
+        elif args.src:
+            from modules.scan import scan_source
+            scan_source(args.src)
+        else:
+            print("[!] Please provide either --url or --src argument.")
+            exit(1)
     except KeyboardInterrupt:
         print("\n[!] Scan interrupted by user (Ctrl+C). Exiting...")
 
Exiting...") diff --git a/modules/scan.py b/modules/scan.py index 27c6672..839a696 100644 --- a/modules/scan.py +++ b/modules/scan.py @@ -1,4 +1,4 @@ -import requests +import requests, os, re from colorama import Fore, Style, init from urllib.parse import urlparse @@ -60,6 +60,25 @@ def get_service_by_host(host): return service_name, service_info return None, None +def scan_source(path): + """ + Scan source code files in the given directory for URLs and check their vulnerability. + """ + url_pattern = re.compile(r'https?://[\w\.-]+(?:/[\w\./\-\?&%#=]*)?') + for root, dirs, files in os.walk(path): + for file in files: + file_path = os.path.join(root, file) + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + urls = url_pattern.findall(content) + for url in set(urls): + print(f"[SCAN] {file_path}: {url}") + check_vulnerability(url) + except Exception as e: + print(f"[ERROR] Could not read {file_path}: {e}") + + def check_vulnerability(url): parsed = urlparse(url) host = parsed.netloc diff --git a/test_sources/vh-v4.html b/test_sources/vh-v4.html new file mode 100644 index 0000000..f72593b --- /dev/null +++ b/test_sources/vh-v4.html @@ -0,0 +1,7 @@ + + Test Vulnerable HTML + + sdfsdfsdf + GitHub + + \ No newline at end of file diff --git a/test_sources/vuln_html.html b/test_sources/vuln_html.html new file mode 100644 index 0000000..e69de29 diff --git a/test_sources/vuln_html3.html b/test_sources/vuln_html3.html new file mode 100644 index 0000000..22cab0c --- /dev/null +++ b/test_sources/vuln_html3.html @@ -0,0 +1,8 @@ + + Test Vulnerable HTML + + CafeBazaar + Myket + sdfsdfsdf + + \ No newline at end of file diff --git a/test_sources/vuln_js.js b/test_sources/vuln_js.js new file mode 100644 index 0000000..e69de29 diff --git a/test_sources/vuln_md.md b/test_sources/vuln_md.md new file mode 100644 index 0000000..e69de29 diff --git a/test_sources/vuln_python.py b/test_sources/vuln_python.py new file mode 100644 index 0000000..e69de29 diff --git a/test_sources/vuln_test.html b/test_sources/vuln_test.html new file mode 100644 index 0000000..22cfbe9 --- /dev/null +++ b/test_sources/vuln_test.html @@ -0,0 +1,38 @@ +import requests +import re +from pathlib import Path + +# Simple regex to match URLs +url_pattern = r'https?://[^\s"\']+' + +# Only scan text-based files +TEXT_EXTENSIONS = [".js", ".html", ".md", ".txt"] +files_to_scan = [f for f in Path(".").rglob("*.*") if f.suffix in TEXT_EXTENSIONS] + +broken_links = [] + +def check_link(link, file_path): + try: + r = requests.get(link, timeout=5, allow_redirects=True) + if r.status_code == 404: + broken_links.append(f"{file_path}: {link}") + except Exception: + broken_links.append(f"{file_path}: {link} (error)") + +# Iterate through all selected files +for file in files_to_scan: + try: + with open(file, "r", encoding="utf-8") as f: + for line in f: + for link in re.findall(url_pattern, line): + check_link(link, file) + except Exception: + continue + +# Write broken links report to a temporary file +with open("broken_links_report.txt", "w", encoding="utf-8") as f: + if broken_links: + for link in broken_links: + f.write(link + "\n") + else: + f.write("No broken links found.\n")