Commit 55cae5ee authored by Knut Wenzig's avatar Knut Wenzig
Browse files

Aktualisierungen, u.a. neue Titelseite

parent e3a6a7fd
......@@ -28,7 +28,9 @@
\usepackage{colortbl}
\definecolor{DIWgruen}{cmyk}{1,0.2,0.55,0}
\definecolor{DIWdunkelblau}{cmyk}{1,0.4,0,0.15}
\definecolor{DIWhellgrau}{cmyk}{0.12,0,0,0.17}
\definecolor{DIWdunkelrot}{cmyk}{0.3,1,0.9,0}
\usepackage{xspace}
\usepackage{graphicx}
......@@ -96,6 +98,18 @@
\newcommand{\varsubsection}[2]{\subsection[{#1 -- #2}]{#1\color{black}\normalfont{ -- #2}\color{DIWgruen}}}
% Titelseite
\addtokomafont{title}{\raggedright\color{DIWdunkelblau}}
%\addtokomafont{author}{\raggedright\setlength{\tabcolsep}{0pt}}
%\addtokomafont{date}{\raggedright}
\newcommand{\sethyphenation}[3][]{%
\sbox0{\begin{otherlanguage}[#1]{#2}
\hyphenation{#3}\end{otherlanguage}}}
\sethyphenation{german}{Wirt-schafts-zweige}
\sethyphenation[variant=usmax]{english}{Wirt-schafts-zweige Stu-di-en-fä-cher}
\begin{document}
\pagestyle{scrheadings}
......
# benoetigt wird:
# - aktuelles R-Paket dortools
# https://gitlab.soep.de/kwenzig/dortools/wikis/home
#
# - R-Paket rmarkdown
#
# - aktuelles Repository additional metadata
# https://gitlab.soep.de/kwenzig/additionalmetadata
library(xlsx)
library(foreign)
library(dortools)
......@@ -5,59 +14,73 @@ library(dortools)
# Das steuert alles
##############################
study <- "soep-is"
distribution <- "2013"
dataset.name <- "bioparen"
version <- "2013"
language <- "de"
study <- "piaac-l"
distribution <- "2014"
dataset.name <- "ZA5989_Household_v1-0-0" # ggfs. auskommentieren, falls mit source aufgerufen
version <- "2014"
language <- "en"
allow.markdown <- TRUE
collapse.variable.groups <- FALSE
collapse.variable.groups <- TRUE
calc.abs.frequencies <- TRUE # TRUE: show only all value labels without frequencies
omit.unlabeled.values <- TRUE
omit.unlabeled.values <- FALSE # if only_labeled_vals==1
optimize.tables <- TRUE
restrict.questions <- TRUE # Anzeige der Fragen begrenzen (unique Fragen im letzten Jahr)
do.xelatex <- TRUE # LaTeX-Lauf starten
rename.pdf <- TRUE # pdf>questionnaire umbenennen do.xelatex=T
running.title.only <- FALSE # Kolumnentitel: with.title muss TRUE sein
with.footer <- TRUE # Fußzeile (questionnaire und Seite)
first.page.number <- 1
# Pfade
# root enthaelt die Ordner "meta", "datasets" und "questionnaires"
rootpath <- "D:/lokal/isdatadoku/"
rootpath <- "D:/lokal/twinlife-datenmanagement/metadata/"
rootpath <- "D:/lokal/min_wage/"
rootpath <- "D:/lokal/core-doku/v31/"
rootpath <- "D:/lokal/FiD-Integration/"
rootpath <- "D:/lokal/dortest/t1/"
rootpath <- "D:/lokal/piaac-l-2014/Metadata/"
# Ordner, in dem das Stata-12-File liegt (saveold!)
datapath <- "//hume/soep-data/DATA/soep30_en/stata/"
datapath <- "D:/lokal/min_wage/datasets/Bus2015_Mindestlohn/"
datapath <- "//hume/soep-data/DATA/soep31_en/stata/"
datapath <- "S:/DATA2/minimal_wage/"
datapath <- "S:/DATA/soep30_en_l/stata/"
datapath <- "S:/DATA2/SOEP-RS/soep-bus/"
datapath <- "S:/DATA2/SOEP-IS/SOEP-IS 2013 Generierung HiWi/Data/finaldata/DE/"
datapath <- "//hume/abt/abt-sop/MA/fams/Janine/Glückstrend Datensätze/"
datapath <- "//hume/soep-data/DATA2/SOEPcore/v31_FiDintegration/20_integrated/"
datapath <- "//hume/soep-data/DATA2/SOEP-IS/SOEP-IS_2013_release/download/soep-is.2013_stata_de/"
datapath <- "D:/lokal/dortest/t1/datasets/auto/"
datapath <- "S:/MA/kwenzig/piaac_l/"
# Ordner > Namen des Datensatzes > codebook.csv
datasetspath <- "H:/clone/glueckstrend/datasets/"
datasetspath <- "D:/lokal/FiD-Integration/"
datasetspath <- "D:/lokal/isdatadoku/datasets/"
codebookcsvpath <- paste0(datasetspath, dataset.name, "/")
codebookcsvpath <- paste0(rootpath, "datasets/", dataset.name, "/")
# generations.csv.path: vector mit Pfaden zu generations
# vector kann auch einzelner Pfad sein
# generations.csv muss vorhanden sein, kann auch leer sein oder nicht "passen"
generations.csv.path <- codebookcsvpath
generations.csv.path <- "D:/lokal/isdatadoku/questionnaires/soep-is-2013-a/"
generations.csv.path <- "D:/lokal/min_wage/questionnaires/min_wage_1509/"
generations.csv.path <- codebookcsvpath
# sucht alle generations.csv unterhalb von root
generations.csv.path <- list.files(path=rootpath, pattern="{1}^generations.csv$",
recursive=TRUE, full.names = TRUE)
generations.csv.path <- gsub("/generations.csv", "/", generations.csv.path, fixed=TRUE)
# questions.csv.path: vector mit Pfaden zu questions&answers
# questions.csv muss vorhanden sein, kann auch leer sein oder nicht "passen"
questions.csv.path <- "D:/lokal/soep-bus/questionnaires/soep-bus-2015/"
questions.csv.path <- list.files(path=rootpath, pattern="{1}^questions.csv$",
recursive=TRUE, full.names = TRUE)
questions.csv.path <- gsub("/questions.csv", "/", questions.csv.path, fixed=TRUE)
# variables.csv.path: vector mit Pfaden zu variables und variable_categories
variables.csv.path <- codebookcsvpath
# sucht rekursiv nach Pfaden, in denen variables.csv enthalten ist
variables.csv.path <- list.files(path=rootpath, pattern="{1}^variables.csv$",
recursive=TRUE, full.names = TRUE)
variables.csv.path <- gsub("/variables.csv", "/", variables.csv.path, fixed=TRUE)
# log_vars.csv.path: vector mit Pfaden zu logical_variables
log_vars.csv.path <- generations.csv.path
log_vars.csv.path <- questions.csv.path
log_vars.csv.path <- codebookcsvpath
# sucht rekursiv nach Pfaden, in denen logical_variables.csv enthalten ist
log_vars.csv.path <- list.files(path=questionnaires.path, pattern="{1}^logical_variables.csv$",
recursive=TRUE, full.names = TRUE)
log_vars.csv.path <- gsub("/logical_variables.csv", "/", log_vars.csv.path, fixed=TRUE)
# questionnaires.csv
meta.path <- paste0(rootpath,"meta/")
addpath <- "D:/lokal/additionalmetadata/" # additional metadata
......@@ -115,7 +138,7 @@ codebook.csv$sort <- 1:nrow(codebook.csv)
# Variablenlisten
dvars <- dataset$variable
cvars <-codebook.csv$variable[codebook.csv$variable!=""]
cvars <- codebook.csv$variable[codebook.csv$variable!=""]
if(length(dvars)==length(cvars)){
if(!all(cvars==dvars)){
warning("Variablen in Datensatz und codebook.csv haben unterschiedliche Reihenfolge (oder gleiche Anzahl mit unterschiedlichen Variablen, s.u.).")
......@@ -125,7 +148,7 @@ if(length(dvars)==length(cvars)){
# Variablen im Datensatz, aber nicht im codebook
dplusvars <- dvars[!is.element(dvars, cvars)]
if(length(dplusvars)>0){
dplusvars
cat(dplusvars)
warning("Variablen in Datensatz aber nicht in codebook.csv")
dplusvars <- as.data.frame(dplusvars, stringsAsFactors = FALSE)
cplus.csv <- merge(codebook.csv, dplusvars,
......@@ -136,15 +159,29 @@ if(length(dplusvars)>0){
cplus.csv$version <- version
cplus.csv$section <- "ADD TO CODEBOOK.CSV"
cplus.csv$sort <- 100000
cplus.csv[,c("waves","years")] <- ""
cplus.csv[,c("waves","years", "group", "only_labeled_vals")] <- ""
codebook.csv <- rbind(codebook.csv, cplus.csv)
}
# Variablen im Codebook, aber nicht im Datensatz
cplusvars <- cvars[!is.element(cvars, dvars)]
if(length(cplusvars)>0){
cplusvars
cat(cplusvars)
warning("Variablen in codebook.csv aber nicht in Datensatz")
# überzählige Zeilen löschen
codebook.csv <- codebook.csv[!is.element(codebook.csv$variable, cplusvars), ]
codebook.csv <- codebook.csv[1:(nrow(codebook.csv)+1), ]
codebook.csv$study[nrow(codebook.csv)] <- study
codebook.csv$dataset[nrow(codebook.csv)] <- dataset.name
codebook.csv$version[nrow(codebook.csv)] <- version
codebook.csv$section[nrow(codebook.csv)] <- "SUPERFLOUS IN CODEBOOK.CSV"
codebook.csv$sectext[nrow(codebook.csv)] <-
paste0("There are variables mentioned in codebook.csv, which cannot be found in the dataset. ",
"You should delete those lines from codebook.csv or add them to the dataset. ",
"The variables are: ",
paste0(cplusvars, collapse = ", "), ".")
codebook.csv$sort[nrow(codebook.csv)] <- 100000
codebook.csv[nrow(codebook.csv),c("variable", "waves","years", "group", "only_labeled_vals")] <- ""
}
......@@ -158,11 +195,15 @@ variables.csv <- multiCSVopen(type="variables",
path=variables.csv.path, root="")
variable_categories.csv <- multiCSVopen(type="variable_categories",
path=variables.csv.path, root="")
path=codebookcsvpath, root="")
logical_variables.csv <- multiCSVopen(type="logical_variables",
path=log_vars.csv.path, root="")
# aus questionnaires.csv wird period zur Auswahl der Fragen benötigt
questionnaires.csv <- read.csv(paste0(meta.path, "questionnaires.csv"),
colClasses="character",encoding="UTF-8")
# titles.csv ist ein selbst administrierter Datensatz mit Informationen
# über die Titelseite (eine Zeile pro Datensatz)
titles.csv <- read.csv(paste0(addpath, "titles.csv"),
......@@ -194,13 +235,18 @@ codebook.csv <- codebook.csv[order(codebook.csv$sort), ]
# gen.path: soll alle input-variablen zum dataset enthalten
# Schritt 1. gen.path mit datensatz als input und output
gen.path <- variables.csv[, GetIDs("variables")]
gen.path <- codebook.csv[codebook.csv$variable!="", GetIDs("variables")]
gen.path <- cbind(gen.path,gen.path)
names(gen.path) <- GetColumns("generations")
# Schritt 2: generations unten ran
generations.csv <- multiCSVopen(type="generations",
path=generations.csv.path, root="")
# nur vollständig gefüllte generations-Zeilen weiterverarbeiten
if(nrow(generations.csv)>0) {
generations.csv <- generations.csv[rowSums(as.data.frame(generations.csv==""))==0, ]
}
gen.path <- rbind(gen.path, generations.csv)
# generations.out wird
generations.out <- generations.csv
......@@ -218,11 +264,21 @@ while(nrow(generations.out)>0) {
generations.out <- generations.out[,c("input_study", "input_dataset", "input_version",
"input_variable","output_study", "output_dataset",
"output_version", "output_variable")]
# generations-Treffer an gen.path appenden
gen.path <- rbind(gen.path,generations.out)
print(paste0(nrow(generations.out), " rows added to gen.path"))
}
quest2var <- merge(variables.csv,gen.path,
# gen.path einschränken: auf der output-Seite sollen nur noch Variablen aus
# dem aktuellen Datensatz stehen
gen.path <- gen.path[gen.path$output_study==study &
gen.path$output_dataset==dataset.name &
gen.path$output_version==version, ]
quest2var <- merge(variables.csv[variables.csv$study==study &
variables.csv$dataset==dataset.name &
variables.csv$version==version, ],
gen.path,
by.x=c("study", "dataset", "version", "variable"),
by.y=c("output_study", "output_dataset",
"output_version", "output_variable"),
......@@ -238,6 +294,7 @@ quest2var <- quest2var[quest2var$questionnaire!="", ]
questions.csv <- multiCSVopen(type="questions",
path=questions.csv.path, root="")
questions.csv$sort <- 1:nrow(questions.csv)
# rootquestions zum ranmergen aufbereiten und umbenennen
rootquestion <- questions.csv[questions.csv$item=="", c("study", "questionnaire", "question", "text", "text_de")]
......@@ -249,7 +306,7 @@ rootquestion <- NULL
# variablenfrage ranmergen, falls gleich rootquestion: leeren
quest2var <- merge(quest2var,
questions.csv[ ,c("study", "questionnaire", "question",
"item", "text", "text_de", "concept")],
"item", "text", "text_de", "concept", "sort")],
by=c("study", "questionnaire", "question", "item"),
all.x=TRUE)
quest2var$text[quest2var$roottext_de==quest2var$text_de] <- ""
......@@ -292,32 +349,44 @@ TitleLaTeX <- function(study, distribution, dataset, version) {
title.row <- min(which((titles.csv$study==study & titles.csv$distribution==distribution &
titles.csv$dataset==dataset & titles.csv$version==version) ==
TRUE))
title <- TeXifyStr(titles.csv$Title[title.row])
title <- titles.csv$Title[title.row]
seriesno <- TeXifyStr(titles.csv$No[title.row])
if(nchar(seriesno)>0){
seriesno <- paste0("SOEP Survey Paper ", seriesno)
}
date <- TeXifyStr(titles.csv$date[title.row])
author <- TeXifyStr(titles.csv$Autor[title.row])
author <- titles.csv$Autor[title.row]
publishers <- TeXifyStr(titles.csv$publishers[title.row])
doi <- TeXifyStr(titles.csv$zuDOI[title.row])
thanks.latex <- ""
doi <- titles.csv$zuDOI[title.row]
bottomline <- ""
if(nchar(doi)>0){
thanks.latex <- paste0("\\thanks{This file is part of a collection, which is released with doi:",
doi, "}\n")
bottomline <- paste0("This file is part of a collection, which is released with doi:",
doi, ".\n")
}
studies.csv <- read.csv(paste0(meta.path,"studies.csv"),
colClasses="character", encoding="UTF-8")
studies.csv <- studies.csv[studies.csv$study==study, ]
study.label <- studies.csv$label[1]
if(is.null(study.label)){
study.label <- ""
}
topleft <- paste0(study.label, " ", distribution)
title.latex <- ""
title.latex <- TitlePageLaTeX(language=language, topleft=topleft, topright=dataset,
title=title, subtitle="", author=author, date="",
bottomline=bottomline, abstract="", study=study, objectid=dataset,
with.footer, running.title.only, first.page.number)
if(language=="en"){
title.latex <- paste0(title.latex,
title.latex2 <- paste0(title.latex,
"\\resetdefaultlanguage{english}\n")
}
title.latex <- paste0(title.latex, paste0("\\ihead{", title, "}\n",
"\\ifoot{", seriesno, "}\n",
"\\title{", title, thanks.latex, "}\n",
"\\date{", date, "}\n",
"\\author{", author, "}\n",
"\\publishers{", publishers, "}\n",
"\\maketitle\n"))
#title.latex2 <- paste0(title.latex, paste0("\\ihead{", title, "}\n",
# "\\ifoot{", seriesno, "}\n",
# "\\title{", title, thanks.latex, "}\n",
# "\\date{", date, "}\n",
# "\\author{", author, "}\n",
# "\\publishers{", publishers, "}\n",
# "\\maketitle\n"))
return(title.latex)
}
......@@ -339,10 +408,24 @@ VarQuestionsLaTeX <- function(variable) {
"variable", "roottext",
"roottext_de", "text", "text_de",
"input_study", "questionnaire",
"question", "item", "concept")],
"question", "item", "concept", "sort")],
by=c("study", "dataset", "version", "variable"),
all=FALSE)
if(nrow(q.tab)>0) {
q.tab <- merge(q.tab, questionnaires.csv[,c("study", "questionnaire", "period")],
by = c("study", "questionnaire"),
all.x = TRUE, all.y = FALSE)
# anzuzeigende Fragen sortieren
q.tab <- q.tab[order(q.tab$period, q.tab$questionnaire, q.tab$sort), ]
if(restrict.questions){
# erste und letzte period auswählen
# q.tab <- q.tab[q.tab$period==max(q.tab$period) | q.tab$period==min(q.tab$period), ]
# nur letzte period auswählen
q.tab <- q.tab[q.tab$period==max(q.tab$period), ]
# identische roottext/text
q.tab <- q.tab[!duplicated(q.tab[, c("roottext", "text")], fromLast=TRUE), ]
}
q.tab$textdelim <- ""
q.tab$textdelim[q.tab$text!=""] <- " // "
q.tab$itemdelim <- ":"
......@@ -376,6 +459,7 @@ VariableLaTeX <- function(variable) {
# - Absatz mit Prosa
# - Absatz mit references
cat(variable) # debug
# variable <- "k_cost" # debug
# variablenname als LaTeX-String
var.latex <- TeXifyStr(codebook.csv$variable.print[codebook.csv$variable==variable])
......@@ -434,11 +518,6 @@ VariableLaTeX <- function(variable) {
categories.frequencies <- categories.frequencies[categories.frequencies$label!="",]
}
}
if(optimize.tables){
if(nrow(categories.frequencies)>30){
categories.frequencies <- categories.frequencies[1:30,]
}
}
# LaTeX-Snippet varsection.latex wird aufgebaut
varsection.latex <- heading.latex
......@@ -446,7 +525,7 @@ VariableLaTeX <- function(variable) {
varsection.latex <- paste0(varsection.latex, VarQuestionsLaTeX(variable))
# Falls Häufigkeitstabelle nicht leer, wird sie erzeugt und an das
# Ausgabe-Snippet angefügt.
if (dim(categories.frequencies)[1] > 0) {
if (nrow(categories.frequencies) > 0) {
# JSON-ähnliche Struktur wird in einen data.frame umgewandelt
# cat.latex <- fromJSON(paste0("{", categories, "}"))
# cat.latex <- data.frame(names(cat.latex),cat.latex)
......@@ -460,6 +539,22 @@ VariableLaTeX <- function(variable) {
cat.latex$abs <- abs(as.numeric(cat.latex$value))
cat.latex <- cat.latex[order(cat.latex$vz, cat.latex$abs),
c("value", "label", "frequency")]
if(optimize.tables){
cat.rows <- nrow(cat.latex)
cat(cat.rows)
if(cat.rows>30){
rows.omitted <- cat.rows-30
frequency.omitted <- as.character(sum(as.numeric(cat.latex[16:(cat.rows-15),
"frequency"])))
cat.latex <- rbind(cat.latex[1:15,],
data.frame(value="...",
label=paste0("(", rows.omitted, " ",
l10n("zeilen.unterdrueckt",language), ")"),
frequency=frequency.omitted),
cat.latex[(cat.rows-14):cat.rows,])
}
}
# Zeilen ohne Wertelabel löschen
# cat.latex <- cat.latex[cat.latex$label!="", ]
# Snippet erzeugen und an ausgabe-Snippet anfügen
......@@ -522,6 +617,7 @@ SectionLaTeX <- function(section) {
# Returns:
# LaTeX-Snippets des kompletten Abschnitts
cat(section)
# Überschrift des Abschnitts als LaTeX-String
section.latex <- TeXifyStr(section)
section.latex <- paste0("\\section{", section.latex, "}\n\n")
......@@ -577,11 +673,14 @@ if(collapse.variable.groups) {
codebook.csv$is.group <- as.logical(codebook.csv$is.group)
codebook.csv$variable.print[codebook.csv$is.group] <-
codebook.csv$group[codebook.csv$is.group]
codebook.csv$label[codebook.csv$is.group & codebook.csv$grouplabel!=""] <-
codebook.csv$grouplabel[codebook.csv$is.group & codebook.csv$grouplabel!=""]
codebook.csv$label[codebook.csv$is.group] <-
paste0(codebook.csv$grouplabel[codebook.csv$is.group], " [variable group]")
paste0(codebook.csv$label[codebook.csv$is.group], " [",
l10n("generischer.Name", language), "]")
codebook.csv$variabletext[codebook.csv$is.group] <-
paste0(codebook.csv$variabletext[codebook.csv$is.group],
" [Given information may correspond only to first variable in group.]")
" [", l10n("generische.Information",language), "]")
}
# Ausgabe des Titelsnippets in Datei
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment