Skip to content

Commit

Permalink
update lyric
Browse files Browse the repository at this point in the history
  • Loading branch information
leavelet committed Jul 6, 2022
1 parent d6e1996 commit f76c3b9
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 113 deletions.
18 changes: 12 additions & 6 deletions src/bin/make_dic.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
extern crate phoneme_lib;
use phoneme_lib::{phoneme_lib::*};
use phoneme_lib::*;

use std::collections::{HashMap};
use std::path::Path;
use std::collections::HashMap;
use std::env;
use std::path::Path;
use std::process::exit;

fn main() {
let args: Vec<String> = env::args().collect();
if args.len() == 1{
if args.len() == 1 {
println!("usage: make_dict <dictfile> <wordfile_dir>");
exit(1);
}
Expand All @@ -29,5 +29,11 @@ fn main() {
exit(1);
}
}
write_to_words(&dic, &WordList::Outer(Box::new(path_of_word.to_owned())), ".", None).unwrap();
}
write_to_words(
&dic,
&WordList::Outer(Box::new(path_of_word.to_owned())),
".",
None,
)
.unwrap();
}
6 changes: 5 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
pub mod download_dict;
pub mod lyric_process;
pub mod phoneme_lib;
//pub mod lyric_process;
pub mod tools;

pub use crate::phoneme_lib::*;
pub use crate::tools::WordList;
45 changes: 45 additions & 0 deletions src/lyric_process/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
use super::tools::*;
use lrc::Lyrics;
use std::collections::BTreeSet;
use std::fmt::Display;
use std::fs::{self, File};
use std::io::{self, Write};
use std::path::{Path, PathBuf};
extern crate lrc;

/// Collects the distinct words of an `.lrc` lyric file.
///
/// The file is read into memory, parsed with `Lyrics::from_str`, and every line returned by
/// `get_lines` is split on whitespace. The words are gathered in a `BTreeSet`, so they are
/// de-duplicated and kept in lexicographical order.
///
/// * `file_name` - path of the lyric file to read.
/// * `write_to_file` - `Some(false)` keeps the words in memory and returns
///   `WordList::Inner`. `None` or `Some(true)` writes one word per line to
///   `<prefix>_word_list.txt` and returns `WordList::Outer` pointing at that file, where
///   `<prefix>` is the first whitespace-delimited token of `file_name`.
///
/// # Errors
///
/// Returns the underlying `io::Error` when the input cannot be read or the output cannot be
/// created or written, and an `io::ErrorKind::InvalidData` error (carrying the parser's
/// message) when the content is not valid lrc.
pub fn process_lrc<P>(file_name: P, write_to_file: Option<bool>) -> io::Result<WordList>
where
    P: AsRef<Path> + Display,
{
    let to_file = write_to_file.unwrap_or(true);
    let raw_lyrics = fs::read_to_string(&file_name)?;
    // The file was read successfully at this point, so a failure here is a content problem,
    // not a missing file; report it as such and keep the parser's diagnostics in the error.
    let lyrics = Lyrics::from_str(&raw_lyrics).map_err(|err| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            format!("{} is not a valid lrc file: {:?}", file_name, err),
        )
    })?;

    let mut words = BTreeSet::new();
    for line in lyrics.get_lines() {
        // split_whitespace never yields empty tokens, so blank lines and repeated
        // spaces do not add an empty "word" to the list.
        for word in line.split_whitespace() {
            words.insert(word.to_string());
        }
    }

    if !to_file {
        return Ok(WordList::Inner(Box::new(words)));
    }

    // NOTE(review): only the first whitespace-delimited token of the name is used as the
    // output prefix, as in the original code — confirm this truncation is intended.
    let name = file_name.to_string();
    let prefix = name.split_whitespace().next().unwrap_or("");
    let final_path = format!("{}_word_list.txt", prefix);

    let mut word_list = io::BufWriter::new(File::create(&final_path)?);
    for word in &words {
        writeln!(word_list, "{}", word)?;
    }
    // Flush explicitly: a flush error during drop would be silently discarded.
    word_list.flush()?;
    Ok(WordList::Outer(Box::new(PathBuf::from(final_path))))
}
198 changes: 92 additions & 106 deletions src/phoneme_lib/mod.rs
Original file line number Diff line number Diff line change
@@ -1,74 +1,49 @@
//! A library that maps words to their phonemes.
use super::tools::*;
use std::collections::BTreeSet;
use std::collections::{HashMap, HashSet};
use std::fmt::{Display, self};
use std::fs::{File, self};
use std::path::{Path, PathBuf};
use std::io::{self, BufRead};
use std::fs::{self, File};
use std::io;
use std::io::prelude::*;
use std::collections::BTreeSet;

/// Opens `file_name` and returns an iterator over its lines, read through a `BufReader`.
///
/// Fails with the `io::Error` from `File::open` when the file cannot be opened.
fn read_lines<P>(file_name: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    File::open(file_name).map(|handle| io::BufReader::new(handle).lines())
}
use std::path::Path;

/// A source of words to process: either a location on disk or an in-memory set.
///
/// Both payloads are stored in a `Box`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WordList {
    /// Path to a file or a directory; consumers detect which of the two it is.
    ///
    /// The path is assumed to contain only valid UTF-8 characters.
    Outer(Box<PathBuf>),
    /// The words to process; the set guarantees there are no duplicates.
    Inner(Box<BTreeSet<String>>),
}

impl Display for WordList {
    /// Writes the path for `Outer`, or the fixed text "inner word list" for `Inner`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            WordList::Inner(_) => f.write_str("inner word list"),
            WordList::Outer(location) => write!(f, "{}", location.display()),
        }
    }
}

///generate word list from files like processed lrc etc.,
///generate word list from files like processed lrc etc.,
///and sort them by lexicographical order
///
///
/// if you have .lrc files, process them using process_lrc from lyric_process mod
///
///
/// if write_to_file is true, will write to file_name_word_list.txt, which is defined by
/// the original library.
pub fn gen_word_list<P> (file_name: P, write_to_file: bool) -> io::Result<WordList>
where P: AsRef<Path> + std::fmt::Display + Clone {
pub fn gen_word_list<P>(file_name: P, write_to_file: bool) -> io::Result<WordList>
where
P: AsRef<Path> + std::fmt::Display + Clone,
{
//TODO: Change to simpler iter
let mut set = BTreeSet::new();
let file_lines = read_lines(&file_name)?;
file_lines.into_iter()
.map(|item|item.unwrap())
.map(|line| {
for word in line.split_whitespace() {
set.insert(word.to_owned());//is to_owned a must?
}
})
.for_each(drop);
file_lines
.into_iter()
.map(|item| item.unwrap())
.map(|line| {
for word in line.split_whitespace() {
set.insert(word.to_owned()); //is to_owned a must?
}
})
.for_each(drop);
if write_to_file {
let final_path = Path::new(&format!("{}_word_list.txt", file_name)).to_owned();
let mut output_file = File::create(&final_path)?;
set.iter()
.map(|word|output_file.write(word.as_bytes()))
.for_each(drop);
.map(|word| output_file.write(word.as_bytes()))
.for_each(drop);
Ok(WordList::Outer(Box::new(final_path)))
}
else {
} else {
Ok(WordList::Inner(Box::new(set)))
}
}

/// Generates a dictionary from a CMU dict or BigCiDian file; the entries are stored in the map `dic`.
pub fn make_dic(path: &Path, dic: &mut HashMap<String, String>) -> io::Result<()>{
pub fn make_dic(path: &Path, dic: &mut HashMap<String, String>) -> io::Result<()> {
println!("using dict {}", path.display());
let lines = read_lines(path)?;
for line in lines {
Expand All @@ -78,40 +53,43 @@ pub fn make_dic(path: &Path, dic: &mut HashMap<String, String>) -> io::Result<()
2 => {
dic.insert(all[0].trim().to_string(), all[1].to_string());
// println!("add {} {}", all[0].trim().to_string(), all[1].to_string())
}
_ => {println!("{:?} the line contains less than two words, skip", all);}
}
_ => {
println!("{:?} the line contains less than two words, skip", all);
}
}
}
Ok(())
}

fn match_and_write
( file: &Path,
dic: &HashMap<String, String>,
missing: &mut HashSet<String>,
out: &str
)
-> io::Result<()>
{
let lines = read_lines(file)
.expect(&format!("can't open {:?}!", file.display()));
fn match_and_write(
file: &Path,
dic: &HashMap<String, String>,
missing: &mut HashSet<String>,
out: &str,
) -> io::Result<()> {
let lines = read_lines(file).unwrap_or_else(|_| panic!("can't open {:?}!", file.display()));
//dataset0042_word2phonemes
println!("write to {}", &out);
let mut outfile = File::create(out).expect("invalid outfile!");
for word in lines {
let mut word = word?.trim().to_string();
//write to format as "word\tphonemes"
if word == "\n".to_string() {
if word == *"\n" {
continue;
}
match dic.get(&word) {
Some(phonemes) => {
word.push_str("\t");
word.push('\t');
let mut phonemes_str = phonemes.to_string();
phonemes_str.push_str("\n");
outfile.write_all(word.as_bytes()).expect("invalid outfile!");
outfile.write_all(phonemes_str.as_bytes()).expect("can't write!");
},
phonemes_str.push('\n');
outfile
.write_all(word.as_bytes())
.expect("invalid outfile!");
outfile
.write_all(phonemes_str.as_bytes())
.expect("can't write!");
}
None => {
println!("no word {} ", word);
missing.insert(word);
Expand All @@ -122,72 +100,78 @@ fn match_and_write
}

///look up word from `path_of_word` using dict `dic`
///
///
/// use path_of_word as the word list directory, or use a BTreeSet to store words
///
///
/// Each word file is assumed to be named `<dataset name><suffix>`;
/// the suffix (`input_subfix`) defaults to "_word_list".
///
///
/// the path of word should be a directory path
pub fn write_to_words<P>
(
dic:&HashMap<String, String>,
path_of_word: &WordList,
pub fn write_to_words<P>(
dic: &HashMap<String, String>,
path_of_word: &WordList,
path_of_output: P,
input_subfix: Option<&str>
)
-> io::Result<()>
where P: AsRef<Path>
input_subfix: Option<&str>,
) -> io::Result<()>
where
P: AsRef<Path>,
{
let mut missing = HashSet::new();
match path_of_word {
WordList::Inner(word_list) => {
let mut outfile = File::create(path_of_output.as_ref().join("dataset1_word2phonemes.txt"))?;
let mut outfile =
File::create(path_of_output.as_ref().join("dataset1_word2phonemes.txt"))?;
for word in word_list.iter() {
match dic.get(word) {
Some(phonemes) => {
let tmp = word.to_string() + "\t" + phonemes + "\n";
outfile.write_all(tmp.as_bytes()).expect("invalid outfile!");
},
}
None => {
println!("no word {} ", word);
missing.insert(word.clone());
}
};
}
},
}
WordList::Outer(path_of_word) => {
let data_subfix = input_subfix.unwrap_or("_word_list");
let file_meta = fs::metadata(path_of_word.as_path())?;
if file_meta.is_dir(){
if file_meta.is_dir() {
let all_file = fs::read_dir(path_of_word.as_ref())?;
for file in all_file {
//dataset0002_word_list
let file = file?;
if file.metadata().unwrap().is_dir(){
if file.metadata().unwrap().is_dir() {
continue;
}
let file_name = file.file_name().into_string().expect("not utf-8, invalid file name!");
let dataset_name =
match file_name.find(data_subfix){
Some(num) => {
file_name[..num].to_string()
}
None => {
println!(r#"file {} do not match the "{}" pattern "#, file_name, data_subfix);
continue;
}
};
let out = format!("{}/{}_word2phonemes.txt", path_of_word.display().to_string(), dataset_name);
let file_name = file
.file_name()
.into_string()
.expect("not utf-8, invalid file name!");
let dataset_name = match file_name.find(data_subfix) {
Some(num) => file_name[..num].to_string(),
None => {
println!(
r#"file {} do not match the "{}" pattern "#,
file_name, data_subfix
);
continue;
}
};
let out = format!(
"{}/{}_word2phonemes.txt",
path_of_word.display(),
dataset_name
);
match_and_write(&file.path(), dic, &mut missing, &out)?;
}
}
else{
} else {
let file_name = path_of_word.as_path();
let file_string = file_name.to_string_lossy();
let file_prefix = file_string.split('.').collect::<Vec<_>>();
let output_file_name = file_prefix[0].to_string() + "_word2phonemes.txt";
match_and_write(file_name, dic, &mut missing, &output_file_name)?;
match_and_write(file_name, dic, &mut missing, &output_file_name)?;
}
}
};
Expand All @@ -209,20 +193,22 @@ fn test_write_to_words() {
dic_mine.insert("hello".to_string(), "HH AH L OW".to_string());
word_list.insert("hello".to_string());
word_list.insert("foobar".to_string());
match write_to_words(&dic_mine, &WordList::Inner(Box::new(word_list)), dir.path(), None){
Err(err) => {println!("this {:?}", err)},
_ => {}
if let Err(err) = write_to_words(
&dic_mine,
&WordList::Inner(Box::new(word_list)),
dir.path(),
None,
) {
println!("this {:?}", err)
}
let mut output_file = File::open(dir.path().join("dataset1_word2phonemes.txt")).unwrap();
let mut output_thing = "".to_string();
output_file.read_to_string(&mut output_thing).unwrap();
assert_eq!(output_thing, "hello\tHH AH L OW\n");

let mut missing_file = File::open(dir.path().join("missing.txt")).unwrap();
let mut missing_thing = "".to_string();
missing_file.read_to_string(&mut missing_thing).unwrap();
assert_eq!(missing_thing, "foobar\n".to_string());

dir.close().unwrap();

}
}
Loading

0 comments on commit f76c3b9

Please sign in to comment.