Hackerss.com

hackerss
hackerss

Posted on

Python: Scrape a website to obtain all links and images


import requests
from bs4 import BeautifulSoup

def get_links(url, timeout=10):
    """
    Scrape a website and collect all link and image URLs.

    Parameters
    ----------
    url : str
        The website to scrape.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up
        (default 10). Without a timeout, ``requests.get`` can hang
        indefinitely.

    Returns
    -------
    list of str
        All ``href`` values of ``<a>`` tags followed by all ``src``
        values of ``<img>`` tags. Tags missing the attribute are
        skipped (previously they produced ``None`` entries).

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (4xx/5xx).
    """
    # Fetch the page; fail loudly on HTTP errors instead of
    # silently parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect href/src attributes, filtering out tags that lack them
    # (tag.get(...) returns None in that case).
    hrefs = [a.get('href') for a in soup.find_all('a') if a.get('href')]
    srcs = [img.get('src') for img in soup.find_all('img') if img.get('src')]

    return hrefs + srcs

# Example usage. Guarded so the network request only runs when the
# file is executed as a script, not on import.
if __name__ == "__main__":
    url = 'https://www.python.org/'
    links = get_links(url)
    # Prints a list of href/src URLs found on the page, e.g.
    # ['https://www.python.org/about/', ..., '/static/img/python-logo.png']
    print(links)
(End of code listing.)

Top comments (0)