diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2022-10-18 21:25:28 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2022-10-18 21:25:28 +0200 |
commit | 1fe88f56190adb26b65d28c48525161534d40cb7 (patch) | |
tree | 8c65d888603788abf673985e1550c47293ec6645 | |
download | nzzgatherer-1fe88f56190adb26b65d28c48525161534d40cb7.tar.gz nzzgatherer-1fe88f56190adb26b65d28c48525161534d40cb7.tar.bz2 |
A very first version that is able to download the current day's issue of the NZZ
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | README | 18 | ||||
-rw-r--r-- | config.json | 10 | ||||
-rw-r--r-- | downloads/.gitkeep | 0 | ||||
-rwxr-xr-x | nzzgrabber | 10 | ||||
-rw-r--r-- | pom.xml | 88 | ||||
-rw-r--r-- | src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java | 20 | ||||
-rw-r--r-- | src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java | 189 | ||||
-rw-r--r-- | src/main/resources/log4j2.xml | 18 |
9 files changed, 355 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3ff9b2b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +dependency-reduced-pom.xml +target @@ -0,0 +1,18 @@ +record page and generate code: +mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="codegen playwright.dev" + +build: +mvn -DskipTests clean install + +run: +./nzzgrabber -c config.json + +links +----- + +- Playwright, web site scraping + - https://javadoc.io/doc/com.microsoft.playwright/playwright/latest/index.html + - https://playwright.dev/java/docs/debug + - https://playwright.dev/java/docs/codegen-intro +- command line parsing + - https://picocli.info/ diff --git a/config.json b/config.json new file mode 100644 index 0000000..67c807c --- /dev/null +++ b/config.json @@ -0,0 +1,10 @@ +{ + "credentials" : { + "login" : "mail@andreasbaumann.cc", + "password" : "N7c42NvEvE", + "user" : "Andreas Baumann" + }, + "downloads" : { + "directory" : "./downloads" + } +} diff --git a/downloads/.gitkeep b/downloads/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/downloads/.gitkeep diff --git a/nzzgrabber b/nzzgrabber new file mode 100755 index 0000000..ea36a5c --- /dev/null +++ b/nzzgrabber @@ -0,0 +1,10 @@ +#!/bin/sh + +export LANG=en_US.UTF-8 + +DEBUG_OPTS="" +#DEBUG_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=8000" +NO_COLOR=yes \ + java -jar -Xms128m -Xmx1G $DEBUG_OPTS \ + -Djava.util.logging.config.file=./logging.properties \ + target/nzz-grabber-1.0-SNAPSHOT.jar $* @@ -0,0 +1,88 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>cc.andreasbaumann.grabbers</groupId> + <artifactId>nzz-grabber</artifactId> + <version>1.0-SNAPSHOT</version> + <packaging>jar</packaging> + <name>NZZ Grabber</name> + + 
<properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <build> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.5.1</version> + <configuration> + <source>1.8</source> + <target>1.8</target> + <debug>true</debug> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.4.3</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>cc.andreasbaumann.grabbers.nzz.Main</mainClass> + </transformer> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + + <dependencies> + <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.15.3</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>2.11.0</version> + </dependency> + <dependency> + <groupId>com.microsoft.playwright</groupId> + <artifactId>playwright</artifactId> + <version>1.27.1</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <version>2.13.4.2</version> + </dependency> + <dependency> + <groupId>info.picocli</groupId> + <artifactId>picocli</artifactId> + <version>4.6.3</version> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <version>2.6.1</version> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <version>2.6.1</version> + </dependency> + </dependencies> +</project> diff --git a/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java 
b/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java new file mode 100644 index 0000000..4c97b3f --- /dev/null +++ b/src/main/java/cc/andreasbaumann/grabbers/nzz/Configuration.java @@ -0,0 +1,20 @@ +package cc.andreasbaumann.grabbers.nzz; + +public class Configuration +{ + static class Credentials + { + public String login; + public String password; + public String user; + } + + public Credentials credentials; + + static class Downloads + { + public String directory; + } + + public Downloads downloads; +} diff --git a/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java b/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java new file mode 100644 index 0000000..28c1f00 --- /dev/null +++ b/src/main/java/cc/andreasbaumann/grabbers/nzz/Main.java @@ -0,0 +1,189 @@ +package cc.andreasbaumann.grabbers.nzz; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.charset.StandardCharsets; + +import java.io.File; +import java.io.BufferedReader; +import java.io.IOException; + +import com.fasterxml.jackson.core.JsonParser.Feature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import picocli.CommandLine; +import picocli.CommandLine.Command; +import picocli.CommandLine.Option; +import picocli.CommandLine.Parameters; +import picocli.CommandLine.Help.Ansi; +import java.util.concurrent.Callable; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import com.microsoft.playwright.Playwright; +import com.microsoft.playwright.BrowserType; +import com.microsoft.playwright.Browser; +import com.microsoft.playwright.BrowserContext; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.Download; +import com.microsoft.playwright.options.AriaRole; + +import java.util.Locale; +import java.util.Arrays; +import java.util.Date; +import java.text.SimpleDateFormat; + +public class Main +{ + private static final Logger LOGGER = LogManager.getFormatterLogger( Main.class ); + public static final 
String USER_AGENT = "NZZ-Grabber/1.0"; + public static final Locale DEFAULT_LOCALE = new Locale( "de", "CH" ); + public static final int DEFAULT_TIMEOUT = 60000; + + @Command( name = "nzzgatherer", mixinStandardHelpOptions = true, version = "1.0", + description = "Grabs NZZ ePaper PDFs and stores them locally.") + static class NZZGatherer implements Callable<Integer> + { + private static Configuration configuration; + private static Playwright playwright; + private static BrowserType browserType; + private static Browser browser = null; + private static BrowserContext context; + private static Locale locale = DEFAULT_LOCALE; + private static Page page; + + private static void initializePlaywright( boolean notHeadless ) + { + LOGGER.info( "Starting playwright..." ); + playwright = Playwright.create( ); + browserType = playwright.chromium( ); + browser = browserType.launch( new BrowserType.LaunchOptions( ) + .setHeadless( !notHeadless ) + .setArgs( Arrays.asList( "--disable-gpu" ) ) + ); + context = browser.newContext( new Browser.NewContextOptions( ) + .setUserAgent( USER_AGENT ) + .setLocale( locale.toString( ) ) + ); + } + + private static void logout( ) throws Exception + { + LOGGER.info( ">>> Logging out.." ); + page.getByText( configuration.credentials.user ).click( ); + page.getByText( "Abmelden" ).click( ); + } + + private static void downloadCurrent( ) throws Exception + { + LOGGER.info( ">>> Downloading current PDF..." ); + Download download = page.waitForDownload( ( ) -> { + page.locator( "div:nth-child(2) > span" ).first( ).click( ); + }); + File directory = new File( configuration.downloads.directory ); + Date today = new Date( ); + String timeStamp = new SimpleDateFormat( "yyyyMMdd").format( today ); + File file = new File( directory, "NZZ_" + timeStamp + ".pdf" ); + LOGGER.info( ">>> Saving to '" + file + "'.." 
); + download.saveAs( file.toPath( ) ); + } + + private static void initialize( ) throws Exception + { + LOGGER.info( ">>> Opening NZZ ePaper..." ); + page = context.newPage( ); + page.setDefaultTimeout( DEFAULT_TIMEOUT ); + page.route( "**", route -> { + LOGGER.info( route.request( ).url( ) ); + route.resume( ); + } ); + page.onLoad( p -> LOGGER.info( "Page loaded!" ) ); + page.onDOMContentLoaded( p -> LOGGER.info( "Page DOM content loaded!" ) ); + } + + private static void login( ) throws Exception + { + + LOGGER.info( ">>> Opening NZZ ePaper.." ); + page.navigate( "https://epaper.nzz.ch/" ); + page.waitForSelector( ":text('Anmelden')" ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Navigate to login page.." ); + page.getByText( "Anmelden" ).click( ); + page.waitForSelector( ":text('E-Mail-Adresse')" ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Inserting email data.." ); + page.getByPlaceholder( "E-Mail-Adresse" ).fill( configuration.credentials.login ); + page.getByRole( AriaRole.BUTTON, new Page.GetByRoleOptions( ).setName( "Weiter" ) ).click( ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Inserting password.." ); + page.getByRole( AriaRole.TEXTBOX, new Page.GetByRoleOptions( ).setName( "Passwort*" ) ).fill( configuration.credentials.password ); + page.waitForLoadState( ); + page.getByRole(AriaRole.BUTTON, new Page.GetByRoleOptions( ).setName( "Anmelden" )).click( ); + page.waitForLoadState( ); + + LOGGER.info( ">>> Downloading current PDF.." 
); + Download download = page.waitForDownload(() -> { + page.locator("div:nth-child(2) > span").first( ).click( ); + } ); + + Thread.sleep( 20000 ); + } + + private static void initializeFromFile( Path configurationFile ) + { + ObjectMapper objectMapper = new ObjectMapper( ); + objectMapper.enable( Feature.ALLOW_UNQUOTED_FIELD_NAMES ); + objectMapper.enable( Feature.ALLOW_COMMENTS ); + + try( BufferedReader configurationReader = Files.newBufferedReader( configurationFile, StandardCharsets.UTF_8 ) ) { + LOGGER.info( "Reading configuration from '" + configurationFile + "'.." ); + configuration = objectMapper.readValue( configurationReader, Configuration.class ); + } catch ( IOException e ) { + LOGGER.error( "Failed to read the configuration file '" + configurationFile + "':\n", e ); + System.exit( 1 ); + } + } + + @Option( names = { "-c", "--config" }, description = "file (in JSON)", defaultValue = "config.json" ) + private String configFile = "config.json"; + + @Option( names = { "--download-current" }, description = "download only todays PDF" ) + private boolean downloadCurrent = false; + + @Option( names = { "--not-headless" }, description = "show browser" ) + private boolean notHeadless = false; + + @Override + public Integer call( ) throws Exception + { + initializeFromFile( new File( configFile ).toPath( ) ); + initializePlaywright( notHeadless ); + initialize( ); + login( ); + if( downloadCurrent ) { + downloadCurrent( ); + } + logout( ); + + return 0; + } + } + + public static void main( String... 
args ) + { + try { + int exitCode = new CommandLine( new NZZGatherer( ) ).execute( args ); + System.exit( exitCode ); + } catch( Exception e ) { + LOGGER.error( e ); + System.exit( 1 ); + } + } +} + + diff --git a/src/main/resources/log4j2.xml b/src/main/resources/log4j2.xml new file mode 100644 index 0000000..72b308f --- /dev/null +++ b/src/main/resources/log4j2.xml @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8" ?> + +<Configuration monitorInterval="1" packages="com.eurospider"> + <Appenders> + <File name="main" fileName="./nzzgrabber.log"> + <PatternLayout pattern="%d %p [%c] - <%m>%n"/> + </File> + <Console name="console"> + <PatternLayout pattern="%p %m%n"/> + </Console> + </Appenders> + <Loggers> + <Root level="info"> + <AppenderRef ref="main"/> + <AppenderRef ref="console"/> + </Root> + </Loggers> +</Configuration> |