Software en Groovy para la recuperación de las conjugaciones de la RAE
This commit is contained in:
parent
d5e324b53a
commit
1afc05273e
26
software/build.gradle
Normal file
26
software/build.gradle
Normal file
|
@ -0,0 +1,26 @@
|
|||
apply plugin: 'groovy'
|
||||
apply plugin: 'application'
|
||||
apply plugin: 'maven'
|
||||
|
||||
group = 'com.acceso'
|
||||
version = '0.1-SNAPSHOT'
|
||||
|
||||
/*
 * Maps Gradle's 'groovy' configuration to Maven's 'compile' scope. This way, when the
 * pom is generated, the dependencies attached to the 'groovy' configuration are included
 * in it. By default only the configurations contributed by Gradle's 'java' plugin are
 * considered.
 *
 * NOTE(review): the original comment named the 'runtime' scope, but the statement below
 * registers the mapping with COMPILE priority and the COMPILE scope.
 */
|
||||
conf2ScopeMappings.addMapping(MavenPlugin.COMPILE_PRIORITY, configurations.groovy, org.gradle.api.artifacts.maven.Conf2ScopeMappingContainer.COMPILE)
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
mavenCentral()
|
||||
}
|
||||
|
||||
// Every dependency (including log4j and jsoup) is declared on the 'groovy' configuration,
// which is why that configuration is mapped into the generated pom above.
dependencies {
|
||||
groovy group: 'org.codehaus.groovy', name: 'groovy-all', version: '1.8.6'
|
||||
groovy group: 'log4j', name: 'log4j', version: '1.2.16'
|
||||
groovy 'org.jsoup:jsoup:1.6.3'
|
||||
}
|
||||
|
||||
// Main class for the 'application' plugin applied at the top of this script.
mainClassName = 'com.acceso.lemarios.ExtractInflectedForms'
|
1
software/settings.gradle
Normal file
1
software/settings.gradle
Normal file
|
@ -0,0 +1 @@
|
|||
// Sets the name of the root Gradle project.
rootProject.name = 'lemarios'
|
|
@ -0,0 +1,102 @@
|
|||
package com.acceso.lemarios
|
||||
|
||||
import org.jsoup.nodes.Document
|
||||
import org.jsoup.Jsoup
|
||||
import org.jsoup.select.Elements
|
||||
|
||||
/**
 * Collects the inflected forms of the verbs listed in the {@code VERB_FILE} classpath
 * resource by querying the lema.rae.es dictionary pages, and appends them, one per line,
 * to {@code OUTPUT_FILE}.
 */
class ExtractInflectedForms {

    private final static String OUTPUT_FILE = '/tmp/inflectedForms.txt'
    private final static String VERB_FILE = '/com/acceso/lemarios/verbos-espanol.txt'
    private final static String LEMA_SEARCH_URL = 'http://lema.rae.es/drae/srv/search?val='
    private final static String INFLECTED_FORMS_URL = 'http://lema.rae.es/drae/srv/'

    /** Tokens that are dropped from the collected forms before saving. */
    private final static List<String> STOP_WORDS = ['me', 'te', 'se', 'nos', 'os', 'etc.']

    private File outputFile = new File(OUTPUT_FILE)

    /**
     * Loads the verb list, extracts the inflected forms of every verb, removes the
     * stop words and appends the remaining forms to the output file.
     * Progress is printed to standard output as "[index, total]".
     */
    public void execute() {
        List<String> inflected = []
        List<String> verbs = loadVerbs()

        verbs.eachWithIndex { verb, index ->
            println "[${index}, ${verbs.size()}]"
            inflected.addAll(extract(verb))
        }

        // The collected forms are neither sorted nor de-duplicated before saving.
        inflected.removeAll(STOP_WORDS)

        save(inflected)
    }

    /**
     * Returns all inflected forms found for a verb, across every conjugation page the
     * search returns for it.
     *
     * @param verb the verb to look up
     * @return the flattened list of forms; a page that fails to load contributes nothing
     */
    private List<String> extract(String verb) {
        List<String> inflectedUrls = getInflectedUrls(verb)

        return inflectedUrls.collect { url ->
            try {
                return getInflectedForms(verb, url)
            } catch (Exception e) {
                // Best effort: log the failing page and carry on with the remaining ones.
                println "Error [${verb}] [${url}]: ${e}"
                return []
            }
        }.flatten()
    }

    /**
     * Downloads one conjugation page and tokenizes the text of its {@code p[class=z]}
     * elements.
     *
     * @param verb the verb being processed (only used in the diagnostic message)
     * @param inflectedUrl the URL of the conjugation page
     * @return the non-empty, trimmed tokens found on the page
     */
    private List<String> getInflectedForms(String verb, String inflectedUrl) {
        Document doc = Jsoup.connect(inflectedUrl).get()
        Elements elements = doc.select("p[class=z]")

        List<String> results = elements.collect { element ->
            // Tokens are separated by commas, slashes, the words " o " / " u ", or a single space.
            element.text().split(',|/| o | u | ').collect { it.trim() }.findAll { it }
        }.flatten()

        if (!results) {
            println "[0 inflected forms for ${verb}]"
        }

        return results
    }

    /**
     * Returns the URLs of the conjugation pages linked from the search result of a verb.
     *
     * @param verb the verb to search for
     * @return the conjugation page URLs; empty when the search fails or links none
     */
    private List<String> getInflectedUrls(String verb) {
        List<String> result = []

        try {
            Document doc = Jsoup.connect(LEMA_SEARCH_URL + URLEncoder.encode(verb, 'ISO-8859-1')).get()
            // Conjugation links are anchors wrapping an image whose alt text starts with "Ver conjugaci".
            Elements elements = doc.select("a img[alt^=Ver conjugaci]")

            result = elements.collect { element -> INFLECTED_FORMS_URL + element.parent().attr('href') }
        } catch (Exception e) {
            // This handler is not inside a closure, so there is no implicit 'it' here;
            // referencing it would throw from the handler and abort the whole run.
            println "Error [${verb}]: ${e}"
        }

        return result
    }

    /**
     * Appends the given forms to the output file, one per line.
     *
     * @param forms the forms to write; nothing is written (and no file is created) when empty
     */
    private void save(List<String> forms) {
        if (!forms) {
            return
        }

        // A single appending writer avoids reopening the file once per form.
        outputFile.withWriterAppend { writer ->
            forms.each { form -> writer << "${form}\n" }
        }
    }

    /**
     * Reads the verb list from the {@code VERB_FILE} classpath resource.
     *
     * @return the selected verbs; empty when the resource cannot be found
     */
    private List<String> loadVerbs() {
        List<String> result = []
        InputStream i = getClass().getResourceAsStream(VERB_FILE)

        if (i) {
            // NOTE(review): only lines containing 'í' are kept — this looks like a leftover
            // filter from a partial re-run; confirm whether the full list should be processed.
            i.eachLine { line -> if (line.contains('í')) result << line }
        } else {
            println "Verb resource not found on the classpath: ${VERB_FILE}"
        }

        return result
    }

    public static void main(String[] args) {
        ExtractInflectedForms e = new ExtractInflectedForms()
        e.execute()
    }
}
|
10783
software/src/main/resources/com/acceso/lemarios/verbos-espanol.txt
Normal file
10783
software/src/main/resources/com/acceso/lemarios/verbos-espanol.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user