inspirat de Scoate nr telefon publi24
Rezultatul este un fisier CSV, care poate fi deschis in Excel (numerele de telefon au 0, nu e vina mea)
O ia de la pagina 1 de anunturi, dar se poate schimba in cod la linia 77 (acum, cand scriu asta, sunt 27300 pagini de anunturi). Se poate schimba si linkul de unde ia anunturi (pentru a sorta dupa categorie), dar nu am programat nimic pentru momentul in care ajunge la sfarsit: o va lua de la 0 din nou si va sta degeaba, pentru ca nu mai gaseste niciun anunt nou.
Cod: http://feed-the-wump.us/raw/iqicohehec
sau local:
requests e singurul modul care necesita instalare
Mai pot adauga si alte lucruri (ex. nume anunt, verificare daca numele contine SRL, filtrare numere care incep cu 02 si 03) la cerere.
Rezultatul este un fisier CSV, care poate fi deschis in Excel (numerele de telefon au 0, nu e vina mea)
O ia de la pagina 1 de anunturi, dar se poate schimba in cod la linia 77 (acum, cand scriu asta, sunt 27300 pagini de anunturi). Se poate schimba si linkul de unde ia anunturi (pentru a sorta dupa categorie), dar nu am programat nimic atunci cand ajunge la sfarsit si o va lua de la 0 din nou si va sta degeaba pentru ca nu are niciun anunt nou
Cod: http://feed-the-wump.us/raw/iqicohehec
sau local:
Cod:
import csv
import os
import re
import shutil
import threading
import time
from datetime import timedelta

import requests
# Single shared HTTP session so keep-alive connections are reused
# across all listing-page and ad-page requests.
s = requests.Session()
def scrape(url):
    """Fetch one Publi24 ad page and extract seller details from its HTML.

    Parameters:
        url: absolute URL of a single ad page.

    Returns:
        dict with keys 'seller_name', 'phone_num' and 'location'; any
        field whose marker is absent from the page is set to None.
    """
    data = {}
    # Mobile UA — the mobile page layout is what the split markers below target.
    html = s.get(url, headers={"User-Agent": "Mozilla/5.0 (Android 11; Mobile; LG-M255; rv:88.0) Gecko/88.0 Firefox/88.0"}).text
    # Each extraction slices between fixed markers; split(...)[1] raises
    # IndexError when the marker is missing, which is the only failure we
    # expect here (the old bare `except:` also ate KeyboardInterrupt).
    try:
        data['seller_name'] = html.split('<a itemprop="url" href="/anunturi-utilizator-')[1].split('>')[1].split('<')[0]
    except IndexError:
        data['seller_name'] = None
    try:
        data['phone_num'] = html.split('var cnt = \'')[1].split('\'')[0]
    except IndexError:
        # probably there's no phone number attached to the ad
        data['phone_num'] = None
    try:
        # Ad-hoc HTML slicing on purpose — avoids a BeautifulSoup dependency.
        loc = html.split('<p class="location">')[1].split('</p>')[0]
        loc = re.sub("<[^<]+?>|\\r\\n", "", loc)  # strip inner tags and CRLF pairs
        loc = loc.replace("Vezi pe harta", "")    # drop the "view on map" link text
        loc = loc.strip()
        # NOTE(review): this replaces a space with an identical space — the
        # original intent was probably a non-breaking space or a double
        # space; kept as-is to preserve behavior, confirm against live pages.
        loc = loc.replace(" ", " ")
        data['location'] = loc
    except IndexError:
        data['location'] = None
    return data
# Epoch second at which this run started; used both in the output CSV
# filename and for the Elapsed/CPM statistics shown by the TUI.
start_time = int(time.time())
def write_data(data):
    """Append one row to this run's CSV file.

    Parameters:
        data: dict whose values (in insertion order) become the row's cells.

    Returns:
        True when the row was written; False when the file could not be
        opened (the user is prompted to close it and scraping resumes).
    """
    try:
        # newline='' is the documented way to open files for the csv module.
        with open(f'scraped_{start_time}.csv', 'a', encoding='UTF-8', newline='') as file:
            writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator='\n')
            # The old hand-rolled quoting appended a trailing comma to most
            # rows (phantom empty Excel column) and wrote the literal text
            # "None" for missing fields; csv.writer fixes both, and also
            # escapes embedded quotes correctly.
            writer.writerow('' if value is None else value for value in data.values())
        return True
    except OSError:
        # Typically the file is locked because Excel has it open (Windows).
        input("Oi! Are you accessing the file?\nPaused scraping, press Enter...")
        return False
# init
# On Windows, shrink the console to a small dashboard window; elsewhere just clear it.
os.system("mode con: cols=50 lines=10" if os.name in ('nt', 'dos') else "clear")
# csv headers
write_data({0: "Name", 1: "Phone Number", 2: "Location"})
write_data({0: "", 1: "", 2: ""})  # blank spacer row under the header
scraped_ads = []   # ad URLs already visited during this run
scraped_data = []  # rows already written (used for dedup and for the stats display)
def tui():
    """Repaint the console dashboard with the current scraping statistics."""
    # Wipe the screen with the platform-appropriate command first.
    os.system('cls' if os.name in ('nt', 'dos') else 'clear')
    cols = shutil.get_terminal_size().columns
    for line in (
        '\x1b[38;2;0;102;170m',  # Publi24 blue for the title
        "Publi24 scraper".center(cols),
        '\x1b[38;2;59;255;78m',  # green for the stats
        f"Scraped: {len(scraped_data)}".center(cols),
        f"Elapsed: {timedelta(seconds=int(time.time()) - start_time)}".center(cols),
        # +1 in the denominator avoids division by zero in the first second.
        f"CPM: {int(len(scraped_data) / ((int(time.time()) - start_time) + 1) * 60)}".center(cols),
        '',
    ):
        print(line)
def tui_worker():
    # NOTE(review): despite the name, this is a one-shot refresh — it waits
    # 1.2s, draws the dashboard once, and the thread then exits. Ongoing
    # refreshing comes from the per-ad tui() calls in the main loop.
    time.sleep(1.2)
    tui()
threading.Thread(target=tui_worker).start()
i = 1  # listing page counter; raise this to resume deeper into the listings
while True:
    # Fetch one page of ad listings (no custom UA needed here; only the
    # individual ad pages are scraped with the mobile UA).
    page = s.get("https://www.publi24.ro/anunturi/?pag=" + str(i)).text
    i += 1  # was the obfuscated `i -=- 1`
    # NOTE(review): the greedy `.*` can glue several URLs that share one
    # HTML line into a single bogus match — consider `.*?` if extracted
    # links ever look merged.
    ads = re.findall(r"https://www\.publi24\.ro/anunturi/.*/[a-z0-9]{30,}\.html", page)
    ads = list(set(ads))  # drop duplicate links within the page
    for ad in ads:
        tui()  # refresh the dashboard before each ad
        if ad in scraped_ads:
            continue  # URL already visited this run
        scdat = scrape(ad)
        scraped_ads.append(ad)
        if scdat in scraped_data:
            continue  # identical row already written (same ad under two URLs)
        if write_data(scdat):
            scraped_data.append(scdat)
Mai pot adauga si alte lucruri (eg. nume anunt, verifica daca numele are SRL in el, filtreaza numere cu 02 si 03) la cerere.