package kr.co.iTrinityIndex.index.board;
import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import org.apache.log4j.Logger;
import org.snu.ids.kkma.index.Keyword;
import org.snu.ids.kkma.index.KeywordExtractor;
import org.snu.ids.kkma.index.KeywordList;
import kr.co.iTrinity4.analysis.Analyzer;
import kr.co.iTrinity4.analysis.standard.StandardAnalyzer;
import kr.co.iTrinity4.document.Document;
import kr.co.iTrinity4.facet.index.CategoryDocumentBuilder;
import kr.co.iTrinity4.facet.taxonomy.CategoryPath;
import kr.co.iTrinity4.facet.taxonomy.TaxonomyWriter;
import kr.co.iTrinity4.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import kr.co.iTrinity4.index.IndexWriter;
import kr.co.iTrinity4.index.IndexWriterConfig;
import kr.co.iTrinity4.index.Term;
import kr.co.iTrinity4.index.IndexWriterConfig.OpenMode;
import kr.co.iTrinity4.store.Directory;
import kr.co.iTrinity4.store.FSDirectory;
import kr.co.iTrinity4.util.Version;
import kr.co.iTrinityIndex.config.Config;
import kr.co.iTrinityIndex.config.IndexConfig;
import kr.co.iTrinityIndex.engine.database.ConnectionPool;
import kr.co.iTrinityIndex.exception.InternalException;
import kr.co.iTrinityIndex.index.DocumentIndex;
import kr.co.iTrinityIndex.index.IndexUtil;
import kr.co.iTrinityIndex.util.DicData;
import kr.co.iTrinityIndex.util.PropertyUtil;
import kr.co.iTrinityIndex.util.TextEdit;
import kr.co.iTrinityIndex.util.TextFilter;
import kr.co.shineware.nlp.komoran.core.analyzer.Komoran;
import kr.co.shineware.util.common.model.Pair;
/**
 * Indexes news ("새소식") rows from the database into the {@code news} search index and its
 * facet taxonomy.
 *
 * <p>{@link #IndexBody(String)} runs the SQL registered under the {@code BOARD} query group of
 * {@link IndexConfig}, turns every row into a {@link Document} through
 * {@link #makeIndexing(HashMap)} and writes it with an {@link IndexWriter}.
 */
public class NewsIndex extends DocumentIndex {
    static Logger logger = Logger.getLogger(NewsIndex.class);

    /** Index type that recreates the index and taxonomy from the {@code news_all} query. */
    private static final String TYPE_ALL = "all";

    /** Index type that appends to the existing index using the {@code news_day} query. */
    private static final String TYPE_DAY = "day";

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} selects the index type ({@code "all"} or
     *             {@code "day"}, case-insensitive). With no argument a full ({@code "all"})
     *             run is performed.
     */
    public static void main(String[] args) {
        // Default to a full rebuild, which is what the previously hard-coded value produced.
        String type = TYPE_ALL;
        if (args != null && args.length > 0 && args[0] != null) {
            type = args[0].trim();
        }
        try {
            NewsIndex bi = new NewsIndex();
            if (type.length() == 0 || TYPE_ALL.equalsIgnoreCase(type)) {
                bi.IndexBody(TYPE_ALL);
            } else if (TYPE_DAY.equalsIgnoreCase(type)) {
                bi.IndexBody(TYPE_DAY);
            } else {
                logger.error("Unknown index type '" + type + "' (expected \"all\" or \"day\")");
            }
        } catch (Exception e) {
            logger.error(e.toString(), e);
        }
    }

    /**
     * Reads the news rows from the database and writes them to the {@code news} index.
     *
     * <p>For {@code "all"} the index and taxonomy are opened with {@code OpenMode.CREATE};
     * for any other value they are opened with {@code OpenMode.CREATE_OR_APPEND}. Errors are
     * logged and not rethrown.
     *
     * @param type {@code "all"} to run the {@code news_all} query, {@code "day"} to run the
     *             {@code news_day} query; any other value leaves the query empty, which makes
     *             the run fail and be logged
     */
    @SuppressWarnings("unchecked")
    @Override
    public void IndexBody(String type) {
        logger.info("NewsIndex(새소식) Start!!!");

        // Database resources.
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        // Index writers.
        TaxonomyWriter taxo = null;
        IndexWriter iw = null;

        // Constant-first comparison so a null type is handled like any other unknown type.
        final boolean fullRebuild = TYPE_ALL.equals(type);

        // Look up the query for the requested run.
        IndexConfig indexConfig = new IndexConfig();
        HashMap sqlHm = indexConfig.getQueryInfo("BOARD");
        String sql = "";
        if (fullRebuild) {
            sql = TextEdit.trim(sqlHm.get("news_all"));
        } else if (TYPE_DAY.equals(type)) {
            sql = TextEdit.trim(sqlHm.get("news_day"));
        }

        try {
            // Open the database cursor.
            conn = ConnectionPool.getConnection("default");
            stmt = conn.createStatement();
            logger.info("sql : " + sql);
            rs = stmt.executeQuery(sql);
            logger.info("rs 접속");

            // Open the index and taxonomy directories.
            String indexFolder = Config.getTrinityInxDB() + File.separator + "news";
            String taxoFolder = Config.getTrinityTaxoDB() + File.separator + "news";
            File indexChkDir = new File(indexFolder);
            File taxoChkDir = new File(taxoFolder);
            logger.info("index dir : " + indexChkDir.getPath());
            logger.info("taxonomy dir : " + taxoChkDir.getPath());
            Directory indexDir = FSDirectory.open(indexChkDir);
            Directory taxoDir = FSDirectory.open(taxoChkDir);

            Analyzer analyzer = new StandardAnalyzer(Version.ITRINITY_40);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.ITRINITY_40, analyzer);
            if (fullRebuild) {
                iwc.setOpenMode(OpenMode.CREATE);
                taxo = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
            } else {
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                taxo = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE_OR_APPEND);
            }
            iw = new IndexWriter(indexDir, iwc);
            // NOTE(review): this is set after the writer has been constructed. If IndexWriter
            // snapshots its config at construction (as Lucene 4 does) the value is ignored;
            // confirm, and move it above the constructor only if a 1 GB buffer fits the heap.
            iwc.setRAMBufferSizeMB(1024);

            // Search option metadata stored alongside the documents.
            // NOTE(review): "unstored_field_attatch" is listed here although makeIndexing
            // does not currently add that field.
            String search_field[] = { "keyword_field_regdate", "keyword_author", "keyword_newscompany",
                    "keyword_j_seq", "unstored_field_subject", "unstored_field_desc", "unstored_field_attatch" };
            String sort_field[] = { "keyword_field_subject", "keyword_field_regdate" };
            String facet_Field[] = { "" };

            // Replace any previously stored option document.
            Term term = new Term("search_key", "index_info");
            iw.deleteDocuments(term);
            Document info_doc = setIndexInfo(search_field, sort_field, facet_Field);
            iw.addDocument(info_doc);

            // Number of rows actually indexed (starts at 0 so the final total is exact).
            int indexed = 0;
            while (rs.next()) {
                // Columns are read in the same order as before; attachment columns are not read.
                HashMap<String, String> hm = new HashMap<String, String>();
                hm.put("j_seq", TextEdit.trim(rs.getString("J_SEQ")));
                hm.put("field_subject", TextEdit.trim(rs.getString("FIELD_SUBJECT")));
                hm.put("field_desc", TextEdit.trim(rs.getString("FIELD_DESC")));
                hm.put("author", TextEdit.trim(rs.getString("AUTHOR")));
                hm.put("newscompany", TextEdit.trim(rs.getString("NEWSCOMPANY")));
                hm.put("category", TextEdit.trim(rs.getString("TAB_CATEGORY")));
                hm.put("url", TextEdit.trim(rs.getString("URL")));
                hm.put("field_regdate", TextEdit.trim(rs.getString("FIELD_REGDATE")));

                // Every document gets the same fixed facet category.
                CategoryPath field_category = new CategoryPath("facet_keyword", "a");
                List<CategoryPath> facetList = categoryPathArrayToList(field_category);
                CategoryDocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo)
                        .setCategoryPaths(facetList);

                Document doc = makeIndexing(hm);
                categoryDocBuilder.build(doc);
                iw.addDocument(doc);

                indexed++;
                if (indexed % 10000 == 0) {
                    logger.info(indexed + "건 색인중...");
                }
            }
            logger.info("전체 " + indexed + "건 색인 완료.");

            taxo.commit();
            iw.commit();
        } catch (Exception e) {
            // Pass the exception itself so the stack trace reaches the log.
            logger.error("callbackIndexing error! : " + e.toString(), e);
            logger.error("query : " + sql);
        } finally {
            if (rs != null) {
                try {
                    rs.close();
                } catch (Exception e) {
                    logger.warn("ResultSet close failed", e);
                }
            }
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (Exception e) {
                    logger.warn("Statement close failed", e);
                }
            }
            if (conn != null) {
                // NOTE(review): assumes close() hands the connection back to ConnectionPool,
                // as pooled JDBC connections normally do — confirm against ConnectionPool.
                try {
                    conn.close();
                } catch (Exception e) {
                    logger.warn("Connection close failed", e);
                }
            }
            if (iw != null) {
                try {
                    iw.close();
                } catch (Exception e) {
                    logger.error("IndexWriter close failed", e);
                }
            }
            if (taxo != null) {
                try {
                    taxo.close();
                } catch (Exception e) {
                    logger.error("TaxonomyWriter close failed", e);
                }
            }
        }
        logger.info("NewsIndex(새소식) End!!!");
    }

    /**
     * Builds the index document for one news row.
     *
     * @param hm row values keyed by {@code j_seq}, {@code field_subject}, {@code field_desc},
     *           {@code author}, {@code newscompany}, {@code category}, {@code url} and
     *           {@code field_regdate}
     * @return the populated document
     * @throws Exception if any field cannot be created; the original failure is attached as
     *                   the cause
     */
    @Override
    public Document makeIndexing(HashMap hm) throws Exception {
        Document document = new Document();
        try {
            String j_seq = TextEdit.trim(hm.get("j_seq"));                 // key
            String field_subject = TextEdit.trim(hm.get("field_subject")); // title
            String field_desc = TextEdit.trim(hm.get("field_desc"));       // body
            String author = TextEdit.trim(hm.get("author"));               // author
            String newscompany = TextEdit.trim(hm.get("newscompany"));     // news company
            String category = TextEdit.trim(hm.get("category"));           // category
            String url = TextEdit.trim(hm.get("url"));                     // url
            String field_regdate = TextEdit.trim(hm.get("field_regdate")); // registration date

            // Key
            document.add(IndexUtil.Keyword("keyword_j_seq", j_seq));
            document.add(IndexUtil.Stroed("stored_j_seq", j_seq));
            // Title
            document.add(IndexUtil.Keyword("keyword_field_subject", field_subject));
            document.add(IndexUtil.UnStored_nGramALL("unstored_field_subject", field_subject));
            document.add(IndexUtil.Stroed("stored_field_subject", field_subject));
            // Body
            document.add(IndexUtil.UnStored_nGram("unstored_field_desc", field_desc));
            document.add(IndexUtil.Stroed("stored_field_desc", field_desc));
            // Author
            document.add(IndexUtil.Keyword("keyword_author", author));
            document.add(IndexUtil.Stroed("stored_author", author));
            // News company
            document.add(IndexUtil.Keyword("keyword_newscompany", newscompany));
            document.add(IndexUtil.Stroed("stored_newscompany", newscompany));
            // Category
            document.add(IndexUtil.Keyword("keyword_category", category));
            document.add(IndexUtil.Stroed("stored_category", category));
            // URL
            document.add(IndexUtil.Keyword("keyword_url", url));
            document.add(IndexUtil.Stroed("stored_url", url));
            // Date; the display variant uses '.' instead of '-' as separator.
            document.add(IndexUtil.Keyword("keyword_field_regdate", field_regdate));
            document.add(IndexUtil.Stroed("stored_field_regdate", field_regdate));
            document.add(IndexUtil.Stroed("stored_view_regdate", field_regdate.replace("-", ".")));
            // Attachment fields are not indexed at the moment.
        } catch (Exception e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new Exception("색인 문서 생성 에러 " + InternalException.getExceptionMessage(e), e);
        }
        return document;
    }

    /**
     * Collects attachment information for one attachment group.
     *
     * <p>Runs {@code fileQuery} with every {@code @@ATCH_FILE_ID} placeholder replaced by
     * {@code atch_file_id}. For each row whose stored file exists, the file is passed to
     * {@link TextFilter#filter} to produce a {@code .txt} file under the configured text
     * directory, and that text (when readable) is appended to the result. Failures are logged
     * and not rethrown.
     *
     * <p>NOTE(review): the id is substituted into the SQL text as-is; it should only ever come
     * from a trusted source.
     *
     * @param atch_file_id attachment group id substituted into the query
     * @param fileQuery    SQL template containing the {@code @@ATCH_FILE_ID} placeholder
     * @param conn         open connection; it is not closed here
     * @return a map with {@code attach_list} (entries of the form
     *         {@code extension@@@originalName###}) and {@code field_attach} (extracted text
     *         followed by the original file name); empty if the query failed
     */
    public static HashMap getAttachData(String atch_file_id, String fileQuery, Connection conn) {
        HashMap<String, String> hm = new HashMap<String, String>();
        Statement stmt = null;
        ResultSet rs = null;
        try {
            // Built inside the try so that null arguments are logged like any other failure.
            String sql = fileQuery.replace("@@ATCH_FILE_ID", atch_file_id);
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);

            StringBuilder attachList = new StringBuilder();
            StringBuilder fieldAttach = new StringBuilder();
            while (rs.next()) {
                String file_stre_cours = TextEdit.trim(rs.getString("FILE_STRE_COURS"));
                String stre_file_nm = TextEdit.trim(rs.getString("STRE_FILE_NM"));
                String orignl_file_nm = TextEdit.trim(rs.getString("ORIGNL_FILE_NM"));
                String file_extsn = TextEdit.trim(rs.getString("FILE_EXTSN"));
                String creat_dt = TextEdit.trim(rs.getString("CREAT_DT"));

                String orgPath = file_stre_cours + File.separator + stre_file_nm;
                String txtDir = Config.getTrinityTxt() + creat_dt + File.separator + atch_file_id;
                String txtPath = txtDir + File.separator + stre_file_nm + ".txt";

                // Extract text only when the stored file is present.
                if (new File(orgPath).exists()) {
                    File txtDirFile = new File(txtDir);
                    if (!txtDirFile.exists() && !txtDirFile.mkdirs()) {
                        logger.warn("Could not create text directory: " + txtDir);
                    }
                    try {
                        TextFilter.filter(orgPath, txtPath);
                    } catch (Exception e) {
                        logger.warn("Text extraction failed for " + orgPath, e);
                    }
                }

                attachList.append(file_extsn).append("@@@").append(orignl_file_nm).append("###");
                try {
                    // getTextData is evaluated first, so nothing is appended when it throws.
                    fieldAttach.append(TextEdit.getTextData(txtPath)).append(" ").append(orignl_file_nm);
                } catch (Exception e) {
                    // Best effort: an attachment without readable text is skipped.
                    logger.debug("No extracted text available at " + txtPath, e);
                }
            }
            hm.put("attach_list", attachList.toString());
            hm.put("field_attach", fieldAttach.toString());
        } catch (Exception e) {
            logger.error(e.toString(), e);
        } finally {
            if (rs != null) {
                try {
                    rs.close();
                } catch (Exception e) {
                    logger.warn("ResultSet close failed", e);
                }
            }
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (Exception e) {
                    logger.warn("Statement close failed", e);
                }
            }
        }
        return hm;
    }

    /**
     * Deletes the documents matching one term using an already open writer, for deletions
     * that are needed while indexing is in progress. The writer is not closed here; that is
     * left to the indexing code that owns it.
     *
     * @param writer      open index writer
     * @param field_name  name of the field to match
     * @param field_value value the field must have
     * @return {@code true} if the delete call completed, {@code false} if it threw
     */
    public static boolean deleteDocumentOne(kr.co.iTrinity4.index.IndexWriter writer, String field_name,
            String field_value) {
        boolean result = false;
        try {
            writer.deleteDocuments(new Term(field_name, field_value));
            result = true;
        } catch (Exception e) {
            // A failed delete is an error, and the stack trace is needed to diagnose it.
            logger.error("색인 삭제 에러", e);
        }
        return result;
    }
}
// No category