Software en Groovy para la recuperación de las conjugaciones de la RAE

This commit is contained in:
Jesus Lanchas 2012-08-24 14:50:51 +02:00
parent d5e324b53a
commit 1afc05273e
4 changed files with 10912 additions and 0 deletions

26
software/build.gradle Normal file
View File

@ -0,0 +1,26 @@
// Build script for the 'lemarios' tool: Groovy sources, runnable via the
// 'application' plugin, and publishable through the 'maven' plugin.
apply plugin: 'groovy'
apply plugin: 'application'
apply plugin: 'maven'
// Coordinates used for the produced artifact.
group = 'com.acceso'
version = '0.1-SNAPSHOT'
/*
* Maps Gradle's 'groovy' configuration to a Maven scope so that, when the POM is generated,
* the dependencies declared under the 'groovy' configuration are included in it. By default
* only the configurations contributed by Gradle's 'java' plugin are considered.
* NOTE(review): the original comment said the target was Maven's 'runtime' scope, but the call
* below passes the COMPILE scope constant - confirm which one is intended.
*/
conf2ScopeMappings.addMapping(MavenPlugin.COMPILE_PRIORITY, configurations.groovy, org.gradle.api.artifacts.maven.Conf2ScopeMappingContainer.COMPILE)
// Dependencies are resolved from the local Maven repository first, then Maven Central.
repositories {
mavenLocal()
mavenCentral()
}
// All dependencies are declared under the 'groovy' configuration (see the mapping above).
dependencies {
groovy group: 'org.codehaus.groovy', name: 'groovy-all', version: '1.8.6'
groovy group: 'log4j', name: 'log4j', version: '1.2.16'
groovy 'org.jsoup:jsoup:1.6.3'
}
// Entry point launched by the 'application' plugin.
mainClassName = 'com.acceso.lemarios.ExtractInflectedForms'

1
software/settings.gradle Normal file
View File

@ -0,0 +1 @@
// Sets the name of the root Gradle project.
rootProject.name = 'lemarios'

View File

@ -0,0 +1,102 @@
package com.acceso.lemarios
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import org.jsoup.select.Elements
/**
 * Collects the inflected (conjugated) forms of Spanish verbs from the RAE online dictionary.
 *
 * For every verb read from the bundled verb list, the dictionary search page is fetched, the
 * links to its conjugation pages are extracted, and every word found in those pages is
 * gathered. The stop words are then removed and the remaining forms are appended, one per
 * line, to {@link #OUTPUT_FILE}. Forms are written in retrieval order; they are neither
 * sorted nor de-duplicated.
 */
class ExtractInflectedForms {

    /** File the inflected forms are appended to. */
    private final static String OUTPUT_FILE = '/tmp/inflectedForms.txt'

    /** Classpath resource holding the verbs to look up, one per line. */
    private final static String VERB_FILE = '/com/acceso/lemarios/verbos-espanol.txt'

    /** Search URL; the URL-encoded verb is appended to it. */
    private final static String LEMA_SEARCH_URL = 'http://lema.rae.es/drae/srv/search?val='

    /** Base URL the relative conjugation links are resolved against. */
    private final static String INFLECTED_FORMS_URL = 'http://lema.rae.es/drae/srv/'

    /** Tokens that show up in conjugation pages but must not be reported as forms. */
    private final static List<String> STOP_WORDS = ['me', 'te', 'se', 'nos', 'os', 'etc.']

    private File outputFile = new File(OUTPUT_FILE)

    /**
     * Runs the whole extraction: loads the verbs, downloads their inflected forms, drops the
     * stop words and saves the result. Progress is printed as "[index, total]" per verb.
     */
    public void execute() {
        List<String> inflected = []
        List<String> verbs = loadVerbs()
        verbs.eachWithIndex { verb, index ->
            println "[${index}, ${verbs.size()}]"
            inflected.addAll(extract(verb))
        }
        inflected.removeAll(STOP_WORDS)
        save(inflected)
    }

    /**
     * Returns every inflected form found for the given verb across all of its conjugation
     * pages. A page that fails to download or parse is logged and contributes no forms, so a
     * single failure does not stop the run.
     *
     * @param verb the verb (infinitive) to look up
     * @return the flattened list of forms; empty when nothing could be retrieved
     */
    private List<String> extract(String verb) {
        List<String> inflectedUrls = getInflectedUrls(verb)
        return inflectedUrls.collect {
            try {
                return getInflectedForms(verb, it)
            } catch(Exception e) {
                println "Error [${verb}] [${it}]: ${e}"
                return []
            }
        }.flatten()
    }

    /**
     * Downloads one conjugation page and returns the words it contains.
     *
     * The text of every {@code <p class="z">} element is split on commas, slashes, the
     * conjunctions " o " / " u " and single spaces; blank tokens are discarded.
     *
     * @param verb the verb the page belongs to (used only for logging)
     * @param inflectedUrl absolute URL of the conjugation page
     * @return the words found in the page; empty when the page has no matching elements
     */
    private List<String> getInflectedForms(String verb, String inflectedUrl) {
        Document doc = Jsoup.connect(inflectedUrl).get()
        Elements elements = doc.select("p[class=z]")
        List<String> results = elements.collect { element ->
            element.text().split(',|/| o | u | ').collect{it.trim()}.findAll{it}
        }.flatten()
        if(!results) {
            println "[0 inflected forms for ${verb}]"
        }
        return results
    }

    /**
     * Returns the URLs of the conjugation pages linked from the verb's search result.
     *
     * The links are located through images whose alt text starts with "Ver conjugaci"; the
     * href of each image's parent element is appended to {@link #INFLECTED_FORMS_URL}.
     * Any failure is logged and yields an empty list.
     *
     * @param verb the verb to search for
     * @return the conjugation page URLs; empty when none are found or the search fails
     */
    private List<String> getInflectedUrls(String verb) {
        List<String> result = []
        try {
            Document doc = Jsoup.connect(LEMA_SEARCH_URL + URLEncoder.encode(verb, 'ISO-8859-1')).get()
            Elements elements = doc.select("a img[alt^=Ver conjugaci]")
            result = elements.collect { element -> INFLECTED_FORMS_URL + element.parent().attr('href') }
        } catch (Exception e) {
            // The original message also interpolated `it`, which does not exist outside a
            // closure; evaluating it threw from inside this handler and aborted the whole run.
            println "Error [${verb}]: ${e}"
        }
        return result
    }

    /**
     * Appends the given forms to the output file, one per line.
     *
     * @param forms the forms to write; nothing is written (and no file is created) when empty
     */
    private void save(List<String> forms) {
        if(!forms) {
            return
        }
        // A single appending writer avoids reopening the file once per form.
        outputFile.withWriterAppend { writer ->
            forms.each { writer << "${it}\n" }
        }
    }

    /**
     * Loads the verbs to process from the {@link #VERB_FILE} classpath resource.
     *
     * @return the selected verbs; empty when the resource cannot be found
     */
    private List<String> loadVerbs() {
        List<String> result = []
        InputStream i = getClass().getResourceAsStream(VERB_FILE)
        if(i) {
            // NOTE(review): only lines containing 'í' are kept, which looks like a leftover
            // from a partial re-run - confirm whether the full verb list should be processed.
            i.eachLine { line -> if(line.contains('í')) result << line }
        } else {
            println "Verb resource not found: ${VERB_FILE}"
        }
        return result
    }

    /** Command-line entry point; arguments are ignored. */
    public static void main(String[] args) {
        ExtractInflectedForms e = new ExtractInflectedForms()
        e.execute()
    }
}

File diff suppressed because it is too large Load Diff