summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <mail@andreasbaumann.cc>2022-10-18 21:25:28 +0200
committerAndreas Baumann <mail@andreasbaumann.cc>2022-10-18 21:25:28 +0200
commit1fe88f56190adb26b65d28c48525161534d40cb7 (patch)
tree8c65d888603788abf673985e1550c47293ec6645
downloadnzzgatherer-1fe88f56190adb26b65d28c48525161534d40cb7.tar.gz
nzzgatherer-1fe88f56190adb26b65d28c48525161534d40cb7.tar.bz2
a very first version which is able to download the current NZZ of the day
-rw-r--r--.gitignore2
-rw-r--r--README18
-rw-r--r--config.json10
-rw-r--r--downloads/.gitkeep0
-rwxr-xr-xnzzgrabber10
-rw-r--r--pom.xml88
-rw-r--r--src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java20
-rw-r--r--src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java189
-rw-r--r--src/main/resources/log4j2.xml18
9 files changed, 355 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3ff9b2b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+dependency-reduced-pom.xml
+target
diff --git a/README b/README
new file mode 100644
index 0000000..c9215ef
--- /dev/null
+++ b/README
@@ -0,0 +1,18 @@
+record page and generate code:
+mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="codegen playwright.dev"
+
+build:
+mvn -DskipTests clean install
+
+run:
+./nzzgrabber -c config.json
+
+links
+-----
+
+- Playwright, web site scraping
+ - https://javadoc.io/doc/com.microsoft.playwright/playwright/latest/index.html
+ - https://playwright.dev/java/docs/debug
+ - https://playwright.dev/java/docs/codegen-intro
+- command line parsing
+ - https://picocli.info/
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..67c807c
--- /dev/null
+++ b/config.json
@@ -0,0 +1,10 @@
+{
+ "credentials" : {
+ "login" : "mail@andreasbaumann.cc",
+ "password" : "N7c42NvEvE",
+ "user" : "Andreas Baumann"
+ },
+ "downloads" : {
+ "directory" : "./downloads"
+ }
+}
diff --git a/downloads/.gitkeep b/downloads/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/downloads/.gitkeep
diff --git a/nzzgrabber b/nzzgrabber
new file mode 100755
index 0000000..ea36a5c
--- /dev/null
+++ b/nzzgrabber
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+export LANG=en_US.UTF-8
+
+DEBUG_OPTS=""
+#DEBUG_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=8000"
+NO_COLOR=yes \
+ java -jar -Xms128m -Xmx1G $DEBUG_OPTS \
+ -Djava.util.logging.config.file=./logging.properties \
+ target/nzz-grabber-1.0-SNAPSHOT.jar $*
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..08f6017
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,88 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+ http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>cc.andreasbaumann.grabbers</groupId>
+ <artifactId>nzz-grabber</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>NZZ Grabber</name>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.5.1</version>
+ <configuration>
+ <source>1.8</source>
+ <target>1.8</target>
+ <debug>true</debug>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>2.4.3</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+ <mainClass>cc.andreasbaumann.grabbers.nzz.Main</mainClass>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.15.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.11.0</version>
+ </dependency>
+ <dependency>
+ <groupId>com.microsoft.playwright</groupId>
+ <artifactId>playwright</artifactId>
+ <version>1.27.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>2.13.4.2</version>
+ </dependency>
+ <dependency>
+ <groupId>info.picocli</groupId>
+ <artifactId>picocli</artifactId>
+ <version>4.6.3</version>
+ </dependency>
+ <!-- log4j 2.6.1 is affected by CVE-2021-44228 ("Log4Shell") and the
+      follow-up CVEs; 2.17.1 contains the fixes and still runs on Java 8.
+      Keep log4j-api and log4j-core on the same version. -->
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-api</artifactId>
+ <version>2.17.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <version>2.17.1</version>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java b/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java
new file mode 100644
index 0000000..4c97b3f
--- /dev/null
+++ b/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java
@@ -0,0 +1,20 @@
+package cc.andreasbaumann.grabbers.nzz;
+
+/**
+ * Plain data holder onto which the JSON configuration file is mapped.
+ * The field names correspond to the keys used in the JSON document.
+ */
+public class Configuration
+{
+    /** Content of the "credentials" object of the configuration. */
+    static class Credentials
+    {
+        public String login;
+        public String password;
+        public String user;
+    }
+
+    /** Content of the "downloads" object of the configuration. */
+    static class Downloads
+    {
+        public String directory;
+    }
+
+    public Credentials credentials;
+    public Downloads downloads;
+}
diff --git a/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java b/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java
new file mode 100644
index 0000000..28c1f00
--- /dev/null
+++ b/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java
@@ -0,0 +1,189 @@
+package cc.andreasbaumann.grabbers.nzz;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.charset.StandardCharsets;
+
+import java.io.File;
+import java.io.BufferedReader;
+import java.io.IOException;
+
+import com.fasterxml.jackson.core.JsonParser.Feature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import picocli.CommandLine;
+import picocli.CommandLine.Command;
+import picocli.CommandLine.Option;
+import picocli.CommandLine.Parameters;
+import picocli.CommandLine.Help.Ansi;
+import java.util.concurrent.Callable;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import com.microsoft.playwright.Playwright;
+import com.microsoft.playwright.BrowserType;
+import com.microsoft.playwright.Browser;
+import com.microsoft.playwright.BrowserContext;
+import com.microsoft.playwright.Page;
+import com.microsoft.playwright.Download;
+import com.microsoft.playwright.options.AriaRole;
+
+import java.util.Locale;
+import java.util.Arrays;
+import java.util.Date;
+import java.text.SimpleDateFormat;
+
+public class Main
+{
+ private static final Logger LOGGER = LogManager.getFormatterLogger( Main.class );
+ public static final String USER_AGENT = "NZZ-Grabber/1.0";
+ public static final Locale DEFAULT_LOCALE = new Locale( "de", "CH" );
+ public static final int DEFAULT_TIMEOUT = 60000;
+
+ @Command( name = "nzzgatherer", mixinStandardHelpOptions = true, version = "1.0",
+ description = "Grabs NZZ ePaper PDFs and stores them locally.")
+ static class NZZGatherer implements Callable<Integer>
+ {
+ private static Configuration configuration;
+ private static Playwright playwright;
+ private static BrowserType browserType;
+ private static Browser browser = null;
+ private static BrowserContext context;
+ private static Locale locale = DEFAULT_LOCALE;
+ private static Page page;
+
+ /**
+  * Starts Playwright, launches Chromium and creates the browser context
+  * used for all further page interactions.
+  *
+  * @param notHeadless if true the browser window is shown, otherwise
+  *                    Chromium is started headless
+  */
+ private static void initializePlaywright( boolean notHeadless )
+ {
+     LOGGER.info( "Starting playwright..." );
+     playwright = Playwright.create( );
+     browserType = playwright.chromium( );
+     browser = browserType.launch( new BrowserType.LaunchOptions( )
+         .setHeadless( !notHeadless )
+         .setArgs( Arrays.asList( "--disable-gpu" ) )
+     );
+     // Locale.toString( ) yields "de_CH"; the browser locale is expected
+     // as a BCP 47 language tag ("de-CH"), which toLanguageTag( ) produces.
+     context = browser.newContext( new Browser.NewContextOptions( )
+         .setUserAgent( USER_AGENT )
+         .setLocale( locale.toLanguageTag( ) )
+     );
+ }
+
+ private static void logout( ) throws Exception
+ {
+ LOGGER.info( ">>> Logging out.." );
+ page.getByText( configuration.credentials.user ).click( );
+ page.getByText( "Abmelden" ).click( );
+ }
+
+ /**
+  * Triggers the download on the current page and stores the result as
+  * NZZ_&lt;yyyyMMdd&gt;.pdf (today's date) in the configured download directory.
+  */
+ private static void downloadCurrent( ) throws Exception
+ {
+     LOGGER.info( ">>> Downloading current PDF..." );
+     // the click starts the download, waitForDownload hands back its handle
+     Download pdf = page.waitForDownload(
+         ( ) -> page.locator( "div:nth-child(2) > span" ).first( ).click( ) );
+
+     File targetDirectory = new File( configuration.downloads.directory );
+     String dateStamp = new SimpleDateFormat( "yyyyMMdd").format( new Date( ) );
+     File target = new File( targetDirectory, "NZZ_" + dateStamp + ".pdf" );
+
+     LOGGER.info( ">>> Saving to '" + target + "'.." );
+     pdf.saveAs( target.toPath( ) );
+ }
+
+ /**
+  * Creates the page in the browser context, applies the default timeout
+  * and installs logging hooks: every routed request URL is logged and then
+  * continued unchanged, and the load / DOMContentLoaded events are logged.
+  */
+ private static void initialize( ) throws Exception
+ {
+     LOGGER.info( ">>> Opening NZZ ePaper..." );
+
+     page = context.newPage( );
+     page.setDefaultTimeout( DEFAULT_TIMEOUT );
+
+     page.route( "**", intercepted -> {
+         String requestUrl = intercepted.request( ).url( );
+         LOGGER.info( requestUrl );
+         intercepted.resume( );
+     } );
+
+     page.onLoad( loaded -> LOGGER.info( "Page loaded!" ) );
+     page.onDOMContentLoaded( loaded -> LOGGER.info( "Page DOM content loaded!" ) );
+ }
+
+ /**
+  * Logs in to the NZZ ePaper site with the configured credentials.
+  *
+  * Opens https://epaper.nzz.ch/, follows "Anmelden", submits the e-mail
+  * address and then the password. This method only logs in; downloading
+  * the PDF is the job of downloadCurrent( ), so that the --download-current
+  * option alone decides whether a download happens.
+  *
+  * @throws Exception declared for the caller; failures of the browser
+  *                   interactions propagate out of this method
+  */
+ private static void login( ) throws Exception
+ {
+     LOGGER.info( ">>> Opening NZZ ePaper.." );
+     page.navigate( "https://epaper.nzz.ch/" );
+     page.waitForSelector( ":text('Anmelden')" );
+     page.waitForLoadState( );
+
+     LOGGER.info( ">>> Navigate to login page.." );
+     page.getByText( "Anmelden" ).click( );
+     page.waitForSelector( ":text('E-Mail-Adresse')" );
+     page.waitForLoadState( );
+
+     LOGGER.info( ">>> Inserting email data.." );
+     page.getByPlaceholder( "E-Mail-Adresse" ).fill( configuration.credentials.login );
+     page.getByRole( AriaRole.BUTTON, new Page.GetByRoleOptions( ).setName( "Weiter" ) ).click( );
+     page.waitForLoadState( );
+
+     LOGGER.info( ">>> Inserting password.." );
+     page.getByRole( AriaRole.TEXTBOX, new Page.GetByRoleOptions( ).setName( "Passwort*" ) ).fill( configuration.credentials.password );
+     page.waitForLoadState( );
+     page.getByRole( AriaRole.BUTTON, new Page.GetByRoleOptions( ).setName( "Anmelden" ) ).click( );
+     page.waitForLoadState( );
+
+     // No fixed sleep here: the locator actions that follow the login
+     // (download click, logout) wait for their target elements themselves,
+     // bounded by the page's default timeout.
+ }
+
+ /**
+  * Reads the JSON configuration file into the static configuration.
+  *
+  * The parser additionally accepts unquoted field names and comments.
+  * If the file cannot be read or parsed, or if it does not contain the
+  * credentials needed for login and logout, an error is logged and the
+  * process exits with status 1.
+  *
+  * @param configurationFile path of the configuration file (UTF-8)
+  */
+ private static void initializeFromFile( Path configurationFile )
+ {
+     ObjectMapper objectMapper = new ObjectMapper( );
+     objectMapper.enable( Feature.ALLOW_UNQUOTED_FIELD_NAMES );
+     objectMapper.enable( Feature.ALLOW_COMMENTS );
+
+     try( BufferedReader configurationReader = Files.newBufferedReader( configurationFile, StandardCharsets.UTF_8 ) ) {
+         LOGGER.info( "Reading configuration from '" + configurationFile + "'.." );
+         configuration = objectMapper.readValue( configurationReader, Configuration.class );
+     } catch ( IOException e ) {
+         LOGGER.error( "Failed to read the configuration file '" + configurationFile + "':\n", e );
+         System.exit( 1 );
+     }
+
+     // A document like "{}" or "null" parses fine but would only fail much
+     // later with a NullPointerException in the middle of the login.
+     if( configuration == null
+         || configuration.credentials == null
+         || configuration.credentials.login == null
+         || configuration.credentials.password == null
+         || configuration.credentials.user == null ) {
+         LOGGER.error( "Configuration file '" + configurationFile
+             + "' must define credentials.login, credentials.password and credentials.user" );
+         System.exit( 1 );
+     }
+ }
+
+ @Option( names = { "-c", "--config" }, description = "file (in JSON)", defaultValue = "config.json" )
+ private String configFile = "config.json";
+
+ @Option( names = { "--download-current" }, description = "download only todays PDF" )
+ private boolean downloadCurrent = false;
+
+ @Option( names = { "--not-headless" }, description = "show browser" )
+ private boolean notHeadless = false;
+
+ /**
+  * Runs the grabber: reads the configuration, starts the browser, logs in,
+  * optionally downloads the current PDF and logs out again.
+  *
+  * @return 0 when all steps completed
+  * @throws Exception if one of the steps fails; Playwright is shut down
+  *                   before the exception leaves this method
+  */
+ @Override
+ public Integer call( ) throws Exception
+ {
+     initializeFromFile( new File( configFile ).toPath( ) );
+     try {
+         initializePlaywright( notHeadless );
+         initialize( );
+         login( );
+         if( downloadCurrent ) {
+             downloadCurrent( );
+         }
+         logout( );
+     } finally {
+         // playwright stays null when Playwright.create( ) itself failed
+         if( playwright != null ) {
+             playwright.close( );
+         }
+     }
+
+     return 0;
+ }
+ }
+
+ /**
+  * Program entry point: lets picocli parse the arguments and run the
+  * command, then exits with the resulting exit code. An exception reaching
+  * this method is logged and turned into exit code 1.
+  *
+  * @param args command line arguments
+  */
+ public static void main( String... args )
+ {
+     try {
+         CommandLine commandLine = new CommandLine( new NZZGatherer( ) );
+         System.exit( commandLine.execute( args ) );
+     } catch( Exception e ) {
+         LOGGER.error( e );
+         System.exit( 1 );
+     }
+ }
+}
+
+
diff --git a/src/main/resources/log4j2.xml b/src/main/resources/log4j2.xml
new file mode 100644
index 0000000..72b308f
--- /dev/null
+++ b/src/main/resources/log4j2.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+
+<!-- Logging setup: everything from level "info" upwards goes to both the
+     log file ./nzzgrabber.log and the console. -->
+<!-- monitorInterval is the interval in seconds at which log4j checks this
+     configuration for changes. -->
+<!-- NOTE(review): packages="com.eurospider" does not match any package of
+     this project (cc.andreasbaumann...); presumably a leftover - confirm. -->
+<Configuration monitorInterval="1" packages="com.eurospider">
+ <Appenders>
+ <!-- file appender: timestamp, level, logger name and the message in angle brackets -->
+ <File name="main" fileName="./nzzgrabber.log">
+ <PatternLayout pattern="%d %p [%c] - &lt;%m&gt;%n"/>
+ </File>
+ <!-- console appender: level and message only -->
+ <Console name="console">
+ <PatternLayout pattern="%p %m%n"/>
+ </Console>
+ </Appenders>
+ <Loggers>
+ <Root level="info">
+ <AppenderRef ref="main"/>
+ <AppenderRef ref="console"/>
+ </Root>
+ </Loggers>
+</Configuration>