From 1fe88f56190adb26b65d28c48525161534d40cb7 Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Tue, 18 Oct 2022 21:25:28 +0200 Subject: a very first version which is able to download the current NZZ of the day --- .gitignore | 2 + README | 18 ++ config.json | 10 ++ downloads/.gitkeep | 0 nzzgrabber | 10 ++ pom.xml | 88 ++++++++++ .../andreasbaumann/grabbers/nzz/Configuration.java | 20 +++ .../java/cc/andreasbaumann/grabbers/nzz/Main.java | 189 +++++++++++++++++++++ src/main/resources/log4j2.xml | 18 ++ 9 files changed, 355 insertions(+) create mode 100644 .gitignore create mode 100644 README create mode 100644 config.json create mode 100644 downloads/.gitkeep create mode 100755 nzzgrabber create mode 100644 pom.xml create mode 100644 src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java create mode 100644 src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java create mode 100644 src/main/resources/log4j2.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3ff9b2b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +dependency-reduced-pom.xml +target diff --git a/README b/README new file mode 100644 index 0000000..c9215ef --- /dev/null +++ b/README @@ -0,0 +1,18 @@ +record page and generate code: +mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="codegen playwright.dev" + +build: +mvn -DskipTests clean install + +run: +./nzzgrabber -c config.json + +links +----- + +- Playwright, web site scraping + - https://javadoc.io/doc/com.microsoft.playwright/playwright/latest/index.html + - https://playwright.dev/java/docs/debug + - https://playwright.dev/java/docs/codegen-intro +- command line parsing + - https://picocli.info/ diff --git a/config.json b/config.json new file mode 100644 index 0000000..67c807c --- /dev/null +++ b/config.json @@ -0,0 +1,10 @@ +{ + "credentials" : { + "login" : "mail@andreasbaumann.cc", + "password" : "N7c42NvEvE", + "user" : "Andreas Baumann" + }, + "downloads" : { + "directory" : 
"./downloads" + } +} diff --git a/downloads/.gitkeep b/downloads/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/nzzgrabber b/nzzgrabber new file mode 100755 index 0000000..ea36a5c --- /dev/null +++ b/nzzgrabber @@ -0,0 +1,10 @@ +#!/bin/sh + +export LANG=en_US.UTF-8 + +DEBUG_OPTS="" +#DEBUG_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=8000" +NO_COLOR=yes \ + java -jar -Xms128m -Xmx1G $DEBUG_OPTS \ + -Djava.util.logging.config.file=./logging.properties \ + target/nzz-grabber-1.0-SNAPSHOT.jar $* diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..08f6017 --- /dev/null +++ b/pom.xml @@ -0,0 +1,88 @@ + + 4.0.0 + + cc.andreasbaumann.grabbers + nzz-grabber + 1.0-SNAPSHOT + jar + NZZ Grabber + + + UTF-8 + + + + + + maven-compiler-plugin + 3.5.1 + + 1.8 + 1.8 + true + + + + org.apache.maven.plugins + maven-shade-plugin + 2.4.3 + + + package + + shade + + + + + cc.andreasbaumann.grabbers.nzz.Main + + + + + + + + + + + + org.jsoup + jsoup + 1.15.3 + + + commons-io + commons-io + 2.11.0 + + + com.microsoft.playwright + playwright + 1.27.1 + + + com.fasterxml.jackson.core + jackson-databind + 2.13.4.2 + + + info.picocli + picocli + 4.6.3 + + + org.apache.logging.log4j + log4j-api + 2.6.1 + + + org.apache.logging.log4j + log4j-core + 2.6.1 + + + diff --git a/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java b/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java new file mode 100644 index 0000000..4c97b3f --- /dev/null +++ b/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java @@ -0,0 +1,20 @@ +package cc.andreasbaumann.grabbers.nzz; + +/** Settings holder that Main fills from the configuration file through ObjectMapper.readValue; the field names mirror the keys of config.json. */ public class Configuration +{ + /** Account data used by Main for the ePaper login and logout. */ static class Credentials + { + /** Filled into the E-Mail-Adresse field of the login form. */ public String login; + /** Filled into the Passwort* field of the login form. */ public String password; + /** Text that Main clicks on before clicking Abmelden when logging out; the sample config.json stores the account holder's name here. */ public String user; + } + + /** The credentials section of the configuration file. */ public Credentials credentials; + + /** Settings for storing downloaded issues. */ static class Downloads + { + /** Directory in which Main saves the downloaded PDF. */ public String directory; + } + + /** The downloads section of the configuration file. */ public Downloads downloads; +} diff --git a/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java 
b/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java new file mode 100644 index 0000000..28c1f00 --- /dev/null +++ b/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java @@ -0,0 +1,189 @@ +package cc.andreasbaumann.grabbers.nzz; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.charset.StandardCharsets; + +import java.io.File; +import java.io.BufferedReader; +import java.io.IOException; + +import com.fasterxml.jackson.core.JsonParser.Feature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import picocli.CommandLine; +import picocli.CommandLine.Command; +import picocli.CommandLine.Option; +import picocli.CommandLine.Parameters; +import picocli.CommandLine.Help.Ansi; +import java.util.concurrent.Callable; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import com.microsoft.playwright.Playwright; +import com.microsoft.playwright.BrowserType; +import com.microsoft.playwright.Browser; +import com.microsoft.playwright.BrowserContext; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.Download; +import com.microsoft.playwright.options.AriaRole; + +import java.util.Locale; +import java.util.Arrays; +import java.util.Date; +import java.text.SimpleDateFormat; + +public class Main +{ + private static final Logger LOGGER = LogManager.getFormatterLogger( Main.class ); + public static final String USER_AGENT = "NZZ-Grabber/1.0"; + public static final Locale DEFAULT_LOCALE = new Locale( "de", "CH" ); + public static final int DEFAULT_TIMEOUT = 60000; + + @Command( name = "nzzgatherer", mixinStandardHelpOptions = true, version = "1.0", + description = "Grabs NZZ ePaper PDFs and stores them locally.") + static class NZZGatherer implements Callable + { + private static Configuration configuration; + private static Playwright playwright; + private static BrowserType browserType; + private static Browser browser = null; + private static BrowserContext context; + private static 
Locale locale = DEFAULT_LOCALE; + private static Page page; + + private static void initializePlaywright( boolean notHeadless ) + { + LOGGER.info( "Starting playwright..." ); + playwright = Playwright.create( ); + browserType = playwright.chromium( ); + browser = browserType.launch( new BrowserType.LaunchOptions( ) + .setHeadless( !notHeadless ) + .setArgs( Arrays.asList( "--disable-gpu" ) ) + ); + context = browser.newContext( new Browser.NewContextOptions( ) + .setUserAgent( USER_AGENT ) + .setLocale( locale.toString( ) ) + ); + } + + private static void logout( ) throws Exception + { + LOGGER.info( ">>> Logging out.." ); + page.getByText( configuration.credentials.user ).click( ); + page.getByText( "Abmelden" ).click( ); + } + + private static void downloadCurrent( ) throws Exception + { + LOGGER.info( ">>> Downloading current PDF..." ); + Download download = page.waitForDownload( ( ) -> { + page.locator( "div:nth-child(2) > span" ).first( ).click( ); + }); + File directory = new File( configuration.downloads.directory ); + Date today = new Date( ); + String timeStamp = new SimpleDateFormat( "yyyyMMdd").format( today ); + File file = new File( directory, "NZZ_" + timeStamp + ".pdf" ); + LOGGER.info( ">>> Saving to '" + file + "'.." ); + download.saveAs( file.toPath( ) ); + } + + private static void initialize( ) throws Exception + { + LOGGER.info( ">>> Opening NZZ ePaper..." ); + page = context.newPage( ); + page.setDefaultTimeout( DEFAULT_TIMEOUT ); + page.route( "**", route -> { + LOGGER.info( route.request( ).url( ) ); + route.resume( ); + } ); + page.onLoad( p -> LOGGER.info( "Page loaded!" ) ); + page.onDOMContentLoaded( p -> LOGGER.info( "Page DOM content loaded!" ) ); + } + + private static void login( ) throws Exception + { + + LOGGER.info( ">>> Opening NZZ ePaper.." ); + page.navigate( "https://epaper.nzz.ch/" ); + page.waitForSelector( ":text('Anmelden')" ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Navigate to login page.." 
); + page.getByText( "Anmelden" ).click( ); + page.waitForSelector( ":text('E-Mail-Adresse')" ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Inserting email data.." ); + page.getByPlaceholder( "E-Mail-Adresse" ).fill( configuration.credentials.login ); + page.getByRole( AriaRole.BUTTON, new Page.GetByRoleOptions( ).setName( "Weiter" ) ).click( ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Inserting password.." ); + page.getByRole( AriaRole.TEXTBOX, new Page.GetByRoleOptions( ).setName( "Passwort*" ) ).fill( configuration.credentials.password ); + page.waitForLoadState( ); + page.getByRole(AriaRole.BUTTON, new Page.GetByRoleOptions( ).setName( "Anmelden" )).click( ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Downloading current PDF.." ); + Download download = page.waitForDownload(() -> { + page.locator("div:nth-child(2) > span").first( ).click( ); + } ); + + Thread.sleep( 20000 ); + } + + private static void initializeFromFile( Path configurationFile ) + { + ObjectMapper objectMapper = new ObjectMapper( ); + objectMapper.enable( Feature.ALLOW_UNQUOTED_FIELD_NAMES ); + objectMapper.enable( Feature.ALLOW_COMMENTS ); + + try( BufferedReader configurationReader = Files.newBufferedReader( configurationFile, StandardCharsets.UTF_8 ) ) { + LOGGER.info( "Reading configuration from '" + configurationFile + "'.." 
); + configuration = objectMapper.readValue( configurationReader, Configuration.class ); + } catch ( IOException e ) { + LOGGER.error( "Failed to read the configuration file '" + configurationFile + "':\n", e ); + System.exit( 1 ); + } + } + + @Option( names = { "-c", "--config" }, description = "file (in JSON)", defaultValue = "config.json" ) + private String configFile = "config.json"; + + @Option( names = { "--download-current" }, description = "download only todays PDF" ) + private boolean downloadCurrent = false; + + @Option( names = { "--not-headless" }, description = "show browser" ) + private boolean notHeadless = false; + + @Override + public Integer call( ) throws Exception + { + initializeFromFile( new File( configFile ).toPath( ) ); + initializePlaywright( notHeadless ); + initialize( ); + login( ); + if( downloadCurrent ) { + downloadCurrent( ); + } + logout( ); + + return 0; + } + } + + public static void main( String... args ) + { + try { + int exitCode = new CommandLine( new NZZGatherer( ) ).execute( args ); + System.exit( exitCode ); + } catch( Exception e ) { + LOGGER.error( e ); + System.exit( 1 ); + } + } +} + + diff --git a/src/main/resources/log4j2.xml b/src/main/resources/log4j2.xml new file mode 100644 index 0000000..72b308f --- /dev/null +++ b/src/main/resources/log4j2.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + -- cgit v1.2.3-54-g00ecf