fetchUntilDuplicate

This commit is contained in:
Patrick Haßel 2024-06-07 08:58:38 +02:00
parent bf674edde0
commit 2838afc843
4 changed files with 84 additions and 24 deletions

View File

@ -0,0 +1,28 @@
package de.ph87.kleinanzeigen.api;
import lombok.Data;
/**
 * Mutable tally of the outcomes seen while fetching offers: how many were
 * created, how many were updated and how many failed.
 *
 * <p>Lombok's {@code @Data} supplies the accessors (e.g. {@code getUpdated()})
 * and the remaining boilerplate.
 */
@Data
public class FetchResult {

    /** Count of {@link MergeResult#CREATED} outcomes recorded so far. */
    private int created;

    /** Count of {@link MergeResult#UPDATED} outcomes recorded so far. */
    private int updated;

    /** Count of {@link MergeResult#ERROR} outcomes recorded so far. */
    private int error;

    /**
     * Records one outcome by incrementing the counter that belongs to it.
     *
     * @param mergeResult the outcome to count
     */
    public void add(final MergeResult mergeResult) {
        switch (mergeResult) {
            case CREATED:
                created += 1;
                break;
            case UPDATED:
                updated += 1;
                break;
            case ERROR:
                error += 1;
                break;
        }
    }

    /**
     * Adds every counter of another tally onto this one; {@code other} is left untouched.
     *
     * @param other the tally whose counts are added to this instance
     */
    public void merge(final FetchResult other) {
        created = created + other.created;
        updated = updated + other.updated;
        error = error + other.error;
    }

}

View File

@ -8,6 +8,7 @@ import org.telegram.telegrambots.meta.api.objects.MaybeInaccessibleMessage;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.time.LocalDate;
import java.time.LocalTime;
@ -22,11 +23,11 @@ import static de.ph87.kleinanzeigen.api.JSON.objectMapper;
@Slf4j
public class Kleinanzeigen {
private static final int KEEP_LAST_OFFERS_COUNT = 50;
private static final int KEEP_LAST_OFFERS_COUNT = 200;
private static final File FILE = new File("./offers.json");
private static final URI VERSCHENKEN_EPPELBORN_30KM = URI.create("https://www.kleinanzeigen.de/s-zu-verschenken/66571/c192l339r30");
private static final String VERSCHENKEN_EPPELBORN_30KM = "https://www.kleinanzeigen.de/s-zu-verschenken/66571/seite:%d/c192l339r30";
private final List<Offer> offers;
@ -54,7 +55,6 @@ public class Kleinanzeigen {
synchronized (offers) {
removed = _cleanUp();
objectMapper.writerWithDefaultPrettyPrinter().writeValue(FILE, offers);
log.info("Wrote {} offers to file: {}", offers.size(), FILE);
}
removed.forEach(remove);
} catch (IOException e) {
@ -82,30 +82,52 @@ public class Kleinanzeigen {
return deleted;
}
public void fetch() {
/**
 * Fetches result pages one after another, starting with page 1, and stops as soon as
 * a page produced at least one {@link MergeResult#UPDATED} outcome (an offer that was
 * already known) or {@code maxPageCount} pages have been requested.
 *
 * <p>The accumulated counters are logged once at the end.
 *
 * @param maxPageCount upper bound on the number of pages requested; values of zero or
 *                     less mean no page is fetched
 */
public void fetchUntilDuplicate(final int maxPageCount) {
    final FetchResult totalFetchResult = new FetchResult();
    // Pages are 1-based. The bound is inclusive of maxPageCount itself, so exactly
    // maxPageCount pages are requested at most (the previous "page <= max" check
    // combined with a pre-increment requested one page too many).
    for (int page = 1; page <= maxPageCount && totalFetchResult.getUpdated() <= 0; page++) {
        totalFetchResult.merge(fetch(page));
    }
    log.info("FetchResult: {}", totalFetchResult);
}
/**
 * Downloads a single result page, then parses and merges every listing on it that is
 * not marked as a top ad. Afterwards {@code save()} is called.
 *
 * <p>An {@link IOException} while loading the page is logged and not rethrown; the
 * counters gathered up to that point are returned.
 *
 * @param page number of the result page to load
 * @return per-page counters of created, updated and failed offers
 */
private FetchResult fetch(final int page) {
    final FetchResult result = new FetchResult();
    try {
        final URI pageUri = getPageURI(page);
        log.info("Fetching page: {}", pageUri);
        final Document listingPage = Jsoup.parse(pageUri.toURL(), 3000);
        for (final Element listing : listingPage.select("li.ad-listitem:not(.is-topad) article.aditem")) {
            result.add(parseAndMerge(listing, pageUri));
        }
        save();
    } catch (IOException e) {
        log.error("Failed to fetch Kleinanzeigen: {}", e.toString());
    }
    return result;
}

/**
 * Parses one listing element and merges the resulting offer into the known offers.
 *
 * @param listing the {@code article} element of the listing
 * @param pageUri URI of the page the listing came from
 * @return the merge outcome, or {@link MergeResult#ERROR} when the listing could not be parsed
 */
private MergeResult parseAndMerge(final Element listing, final URI pageUri) {
    final Offer parsed;
    try {
        parsed = parse(listing, pageUri);
    } catch (OfferParseException e) {
        log.error("Failed to parse Offer:", e);
        return MergeResult.ERROR;
    }
    // Merging happens outside the try so only parse failures are mapped to ERROR.
    return merge(parsed);
}
private Offer parse(final Element article) throws OfferParseException {
/**
 * Builds the address of one result page by inserting the page number into the
 * search URL template.
 *
 * @param page page number substituted for the template's {@code %d} placeholder
 * @return URI of the requested result page
 * @throws MalformedURLException kept in the signature for existing callers; nothing
 *                               in this body raises it
 */
private URI getPageURI(final int page) throws MalformedURLException {
    // Locale.ROOT keeps the digits ASCII. Formatting with the default locale may
    // render %d with locale-specific digits, which would corrupt the URL.
    return URI.create(String.format(java.util.Locale.ROOT, VERSCHENKEN_EPPELBORN_30KM, page));
}
private Offer parse(final Element article, final URI uri) throws OfferParseException {
try {
final String id = article.attr("data-adid");
final String title = article.select(".text-module-begin").text();
final String description = article.select(".aditem-main--middle--description").text();
final ZonedDateTime date = parseDate(article.select(".aditem-main--top--right").text());
final String articleURL = VERSCHENKEN_EPPELBORN_30KM.resolve(article.select(".aditem-image a").attr("href")).toString();
final String articleURL = uri.resolve(article.select(".aditem-image a").attr("href")).toString();
final String zipcode;
final String location;
final Integer distance;
@ -124,30 +146,35 @@ public class Kleinanzeigen {
final String imageURL = getImageURL(articleURL);
return new Offer(id, date, title, zipcode, location, distance, description, articleURL, imageURL);
} catch (NumberFormatException | IOException e) {
} catch (NumberFormatException e) {
throw new OfferParseException(article, e);
}
}
private String getImageURL(final String articleURL) throws IOException {
final String imageURL;
private String getImageURL(final String articleURL) {
try {
final Document document = Jsoup.parse(URI.create(articleURL).toURL(), 3000);
final Element image = document.select(".galleryimage-element img").first();
if (image == null) {
imageURL = "";
} else {
imageURL = image.attr("src");
if (image != null) {
return image.attr("src");
}
return imageURL;
} catch (IOException e) {
log.error("Failed to load Article page: {}", articleURL);
}
return "";
}
private void merge(final Offer offer) {
private MergeResult merge(final Offer offer) {
synchronized (offer) {
offers.stream().filter(existing -> existing.getId().equals(offer.getId())).peek(existing -> existing.merge(offer)).findFirst().orElseGet(() -> {
final Optional<Offer> existingOptional = offers.stream().filter(existing -> existing.getId().equals(offer.getId())).findFirst();
if (existingOptional.isPresent()) {
existingOptional.get().merge(offer);
return MergeResult.UPDATED;
} else {
log.info("Created: {}", offer);
offers.add(offer);
return offer;
});
return MergeResult.CREATED;
}
}
}

View File

@ -29,7 +29,7 @@ public class Main {
}
/**
 * Runs one polling cycle: fetches listings with a page limit of 5, then hands every
 * offer that has no Telegram message id yet to the bot for sending.
 *
 * @param bot the bot used to send offers
 */
private static void handle(final Bot bot) {
    final int maxPageCount = 5;
    kleinanzeigen.fetchUntilDuplicate(maxPageCount);
    kleinanzeigen.findAll()
            .stream()
            .filter(offer -> offer.getTelegramMessageId() == null)
            .forEach(bot::send);
}

View File

@ -0,0 +1,5 @@
package de.ph87.kleinanzeigen.api;
/**
 * Outcome of handling a single offer during a fetch.
 */
public enum MergeResult {

    /** The offer was not known before and has been added. */
    CREATED,

    /** An offer with the same id already existed and was merged with the new data. */
    UPDATED,

    /** The offer could not be processed. */
    ERROR

}