[py3] Scraper Publi24 (numere telefon, nume, locatie anunt)

rohr

Member
Registered
inspirat de Scoate nr telefon publi24

1624534401200.png
Rezultatul este un fisier CSV, care poate fi deschis in Excel (numerele de telefon au 0, nu e vina mea)
1624534603900.png 1624534780800.png
O ia de la pagina 1 de anunturi, dar se poate schimba in cod la linia 77 (acum, cand scriu asta, sunt 27300 pagini de anunturi). Se poate schimba si linkul de unde ia anunturi (pentru a sorta dupa categorie), dar nu am programat nimic pentru momentul cand ajunge la sfarsit: o va lua de la 0 din nou si va sta degeaba, pentru ca nu mai are niciun anunt nou.

Cod: http://feed-the-wump.us/raw/iqicohehec
sau local:
Cod:
import re
import os
import time
import shutil
import threading
import requests
from datetime import timedelta
s = requests.Session()

def scrape(url):
    """Fetch a Publi24 ad page and extract seller name, phone number and location.

    Returns a dict with keys 'seller_name', 'phone_num', 'location' (in that
    order -- write_data() relies on the value order). Each value is a str, or
    None when the corresponding marker is not found in the page.
    """
    # Mobile UA: the mobile page embeds the phone number in `var cnt = '...'`.
    headers = {"User-Agent": "Mozilla/5.0 (Android 11; Mobile; LG-M255; rv:88.0) Gecko/88.0 Firefox/88.0"}
    html = s.get(url, headers=headers).text
    return {
        'seller_name': _parse_seller(html),
        'phone_num': _parse_phone(html),
        'location': _parse_location(html),
    }

def _parse_seller(html):
    """Text of the seller profile link, or None when absent."""
    try:
        # was a bare `except:`; only the split/index can fail here
        return html.split('<a itemprop="url" href="/anunturi-utilizator-')[1].split('>')[1].split('<')[0]
    except IndexError:
        return None

def _parse_phone(html):
    """Phone number from the inline `var cnt = '...'` script, or None."""
    try:
        return html.split('var cnt = \'')[1].split('\'')[0]
    except IndexError:
        # probably there's no number attached to the ad
        return None

def _parse_location(html):
    """Cleaned-up text of the <p class="location"> element, or None."""
    try:
        # deliberate string splitting instead of an HTML parser (author's choice)
        loc = html.split('<p class="location">')[1].split('</p>')[0]
    except IndexError:
        return None
    loc = re.sub("<[^<]+?>|\\r\\n", "", loc)          # strip inner tags and CRLFs
    loc = loc.replace("Vezi pe harta", "")            # drop the "view on map" label
    loc = loc.strip()
    loc = loc.replace("                            ", " ")  # collapse the template's indent run
    return loc

# Timestamp used both for the output filename and the TUI elapsed counter.
start_time = int(time.time())

def write_data(data):
    """Append one CSV row built from data.values() to scraped_<start_time>.csv.

    Every field is double-quoted and fields are comma-joined. (The original
    hand-rolled separator logic appended a trailing comma to comma-free fields
    and NO separator after fields containing commas, which could merge two
    columns; joining fixes both.)

    Returns True on success. On an I/O error (e.g. the file is locked open in
    Excel) it prompts the user and returns False so the caller can retry.
    """
    row = ','.join(f'"{value}"' for value in data.values())
    try:
        with open(f'scraped_{start_time}.csv', 'a', encoding='UTF-8') as file:
            file.write(row + '\n')
        # `with` closes the file; the old explicit close() was redundant
        return True
    except OSError:
        input("Oi! Are you accessing the file?\nPaused scraping, press Enter...")
        return False

# init: shrink the console window on Windows/DOS, just clear it elsewhere
if os.name in ('nt', 'dos'):
    os.system("mode con: cols=50 lines=10")
else:
    os.system("clear")

# write the CSV header row plus one blank spacer row
write_data({0: "Name", 1: "Phone Number", 2: "Location"})
write_data({0: "", 1: "", 2: ""})

scraped_ads = []   # ad URLs already visited this run
scraped_data = []  # rows already written, used to skip duplicate entries

def tui():
    """Clear the console and redraw the banner plus live scraping stats."""
    clear_cmd = 'cls' if os.name in ('nt', 'dos') else 'clear'
    os.system(clear_cmd)
    cols = shutil.get_terminal_size().columns
    # 24-bit ANSI colors: blue-ish banner, green stats
    print('\x1b[38;2;0;102;170m')
    print("Publi24 scraper".center(cols))
    print('\x1b[38;2;59;255;78m')
    print(f"Scraped: {len(scraped_data)}".center(cols))
    print(f"Elapsed: {timedelta(seconds=int(time.time()) - start_time)}".center(cols))
    # +1 avoids division by zero in the very first second
    print(f"CPM: {int(len(scraped_data) / ((int(time.time()) - start_time) + 1)  * 60)}".center(cols))
    print()

def tui_worker():
    """Wait briefly, then draw the TUI once shortly after startup."""
    startup_delay = 1.2
    time.sleep(startup_delay)
    tui()

# fire-and-forget: the main loop redraws on every ad afterwards
worker = threading.Thread(target=tui_worker)
worker.start()

i = 1  # start page; bump this to resume deeper into the listing
while True:
    # fetch listing page i of the general ads index
    page = s.get("https://www.publi24.ro/anunturi/?pag=" + str(i)).text
    i += 1  # was the obfuscated `i -=- 1`
    # ad detail URLs end in a 30+ character slug before .html;
    # a set drops duplicate matches directly (was list(set(...)))
    ads = set(re.findall(r"https://www\.publi24\.ro/anunturi/.*/[a-z0-9]{30,}\.html", page))

    for ad in ads:
        tui()
        if ad in scraped_ads:
            continue
        scdat = scrape(ad)
        scraped_ads.append(ad)
        # same ad reposted under a different URL -> identical record, skip it
        if scdat in scraped_data:
            continue
        # only remember the record once it actually reached the CSV
        if write_data(scdat):
            scraped_data.append(scdat)
requests e singurul modul care necesita instalare

Mai pot adauga si alte lucruri (eg. nume anunt, verifica daca numele are SRL in el, filtreaza numere cu 02 si 03) la cerere.
 
Loading...
Back
Sus