app.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
"""Scrape title, price, rating, review count and availability for each
product URL listed in ``url.txt`` and write the results to ``out.xlsx``.
"""

import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with every product-page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'
}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 10

# Pause inserted between consecutive requests.
REQUEST_DELAY_SECONDS = 2

# Element ids tried, in order, when looking for the price.
PRICE_ELEMENT_IDS = (
    'priceblock_ourprice',
    'priceblock_dealprice',
    'priceblock_saleprice',
)

# Placeholder stored when a field cannot be found on the page.
NOT_AVAILABLE = "NA"


def _clean_text(tag):
    """Return the tag's stripped text with commas removed, or "NA" if *tag* is None."""
    if tag is None:
        return NOT_AVAILABLE
    return tag.get_text(strip=True).replace(',', '')


def _find_price_tag(soup):
    """Return the first element that carries the price, or None if there is none."""
    for element_id in PRICE_ELEMENT_IDS:
        tag = soup.find("span", attrs={"id": element_id})
        if tag is not None:
            return tag
    # Fallback used when none of the id-based price elements is present.
    return soup.find("span", class_="a-price-whole")


def _find_rating_tag(soup):
    """Return the element that carries the star rating text, or None."""
    # Match any star-icon element instead of one exact class string: the
    # remaining classes presumably vary with the rating value (for example
    # ``a-star-4-5``), so an exact-string match would only hit one rating.
    tag = soup.find("i", class_="a-icon-star")
    if tag is not None:
        return tag
    return soup.find("span", class_="a-icon-alt")


def _parse_product(soup):
    """Extract the product fields from a parsed page; missing fields become "NA"."""
    title_tag = soup.find("span", attrs={"id": 'productTitle'})
    # The title keeps its commas; only the other fields strip them.
    title = title_tag.get_text(strip=True) if title_tag is not None else NOT_AVAILABLE

    availability_div = soup.find("div", attrs={'id': 'availability'})
    availability_span = (
        availability_div.find("span") if availability_div is not None else None
    )

    return {
        "Title": title,
        "Price": _clean_text(_find_price_tag(soup)),
        "Rating": _clean_text(_find_rating_tag(soup)),
        "Reviews": _clean_text(
            soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        ),
        "Availability": _clean_text(availability_span),
    }


def extract_product_info(url):
    """Fetch one product page and return its scraped fields.

    Args:
        url: Product page URL; surrounding whitespace is ignored.

    Returns:
        A dict with the keys ``Title``, ``Price``, ``Rating``, ``Reviews``,
        ``Availability`` and ``URL`` (fields not found on the page hold
        ``"NA"``), or ``None`` if the HTTP request failed.
    """
    # Strip once so the request, the error message and the result all agree.
    url = url.strip()

    try:
        response = requests.get(
            url, headers=HEADERS, timeout=REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Request error for URL {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, "lxml")
    info = _parse_product(soup)

    print(f"Title: {info['Title']}")
    print(f"Price: {info['Price']}")
    print(f"Rating: {info['Rating']}")
    print(f"Reviews: {info['Reviews']}")
    print(f"Availability: {info['Availability']}")
    print("-" * 80)

    info["URL"] = url
    return info


def main():
    """Read URLs from ``url.txt``, scrape each one and save the rows to ``out.xlsx``."""
    try:
        with open("url.txt", "r", encoding="utf-8") as file:
            # One URL per line; blank lines are skipped.
            urls = [line.strip() for line in file if line.strip()]
    except FileNotFoundError:
        print("The file 'url.txt' was not found.")
        return

    data = []
    for index, url in enumerate(urls):
        if index:
            # Respectful delay between requests (not needed before the
            # first request or after the last one).
            time.sleep(REQUEST_DELAY_SECONDS)
        info = extract_product_info(url)
        if info:
            data.append(info)

    if data:
        df = pd.DataFrame(data)
        df.to_excel("out.xlsx", index=False)
        print("Data has been written to 'out.xlsx'.")
    else:
        print("No data was extracted.")


if __name__ == "__main__":
    main()
Before running the script, create a url.txt file in the same directory
and store the Amazon product URLs in it, one URL per line,
as shown below:
1 2 3 4 |
https://www.amazon.in/Voltas-Vectra-Platina-Fixed-Window/dp/B0BQYB5YVF/?_encoding=UTF8&pd_rd_w=hq1sE&content-id=amzn1.sym.509965a2-791b-4055-b876-943397d37ed3%3Aamzn1.symc.fc11ad14-99c1-406b-aa77-051d0ba1aade&pf_rd_p=509965a2-791b-4055-b876-943397d37ed3&pf_rd_r=ZYEY4GCZWG1XC6FFCHZS&pd_rd_wg=WJwcO&pd_rd_r=bcabcaaf-9551-472c-a739-23c371f490d3&ref_=pd_hp_d_atf_ci_mcx_mr_ca_hp_atf_d
https://www.amazon.in/gp/product/B0BYSDH7P9/ref=ewc_pr_img_1?smid=A1WYWER0W24N8S&th=1&psc=1
https://www.amazon.in/Van-Heusen-Sport-Regular-VSKP517S011408_Black_Small/dp/B076CJFG6J/ref=srd_d_vsims_d_sccl_2_4/261-0648790-8166604?pd_rd_w=eNLyX&content-id=amzn1.sym.7ccbe032-5929-4c88-ab39-4923842061df&pf_rd_p=7ccbe032-5929-4c88-ab39-4923842061df&pf_rd_r=4PCK8VF24A9PMXVC8EES&pd_rd_wg=qnZKc&pd_rd_r=6632a723-14ec-4620-87f4-ba1ecdb21d9a&pd_rd_i=B0F6YQBDYT&psc=1
https://www.amazon.in/Microtek-EM4170-170V-270V-Digital-Stabilizer/dp/B01GTQFHOC/ref=srd_d_vsims_d_sccl_3_2/261-0648790-8166604?pd_rd_w=TR8wz&content-id=amzn1.sym.7ccbe032-5929-4c88-ab39-4923842061df&pf_rd_p=7ccbe032-5929-4c88-ab39-4923842061df&pf_rd_r=GZJFXRMKVK0G6K9N1Z34&pd_rd_wg=J6bwo&pd_rd_r=faaa8d40-b1b5-4179-bcbb-b65fca85475a&pd_rd_i=B01GTQFHOC&psc=1