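// Web-page residue, not part of the program: "본문 바로가기" ("Skip to main content" link text).

// Web-page residue, not part of the program: "카테고리 없음" ("Uncategorized" label).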

package kr.co.iTrinityIndex.index.board;

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;

import org.apache.log4j.Logger;
import org.snu.ids.kkma.index.Keyword;
import org.snu.ids.kkma.index.KeywordExtractor;
import org.snu.ids.kkma.index.KeywordList;

import kr.co.iTrinity4.analysis.Analyzer;
import kr.co.iTrinity4.analysis.standard.StandardAnalyzer;
import kr.co.iTrinity4.document.Document;
import kr.co.iTrinity4.facet.index.CategoryDocumentBuilder;
import kr.co.iTrinity4.facet.taxonomy.CategoryPath;
import kr.co.iTrinity4.facet.taxonomy.TaxonomyWriter;
import kr.co.iTrinity4.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import kr.co.iTrinity4.index.IndexWriter;
import kr.co.iTrinity4.index.IndexWriterConfig;
import kr.co.iTrinity4.index.Term;
import kr.co.iTrinity4.index.IndexWriterConfig.OpenMode;
import kr.co.iTrinity4.store.Directory;
import kr.co.iTrinity4.store.FSDirectory;
import kr.co.iTrinity4.util.Version;
import kr.co.iTrinityIndex.config.Config;
import kr.co.iTrinityIndex.config.IndexConfig;
import kr.co.iTrinityIndex.engine.database.ConnectionPool;
import kr.co.iTrinityIndex.exception.InternalException;
import kr.co.iTrinityIndex.index.DocumentIndex;
import kr.co.iTrinityIndex.index.IndexUtil;
import kr.co.iTrinityIndex.util.DicData;
import kr.co.iTrinityIndex.util.PropertyUtil;
import kr.co.iTrinityIndex.util.TextEdit;
import kr.co.iTrinityIndex.util.TextFilter;
import kr.co.shineware.nlp.komoran.core.analyzer.Komoran;
import kr.co.shineware.util.common.model.Pair;

public class NewsIndex extends DocumentIndex {

   static Logger logger = Logger.getLogger(NewsIndex.class);

   public static void main(String[] args) {
      String type = "";

      try {
         // type = args[0].toLowerCase();
//         4. 문화      CODE=1021   //news
         type = "all";
         NewsIndex bi = new NewsIndex();
         if (type.equals("") || type.equals("all")) {
            bi.IndexBody("all");
         } else if (type.equals("day")) {
            bi.IndexBody("day");
         }
      } catch (Exception e) {
         logger.error(e.toString());
      }

   }

   @SuppressWarnings("unchecked")
   @Override
   public void IndexBody(String type) {
      logger.info("NewsIndex(새소식) Start!!!");
      // DB 데이터를
      Connection conn = null;
      Statement stmt = null;
      ResultSet rs = null;

      // 색인 데이터 write
      TaxonomyWriter taxo = null;
      IndexWriter iw = null;

      // 쿼리 설정 가져오기
      IndexConfig indexConfig = new IndexConfig();
      HashMap sqlHm = indexConfig.getQueryInfo("BOARD");
      String sql = "";
      String fileQuery = "";

      if (type.equals("all")) {
         sql = TextEdit.trim(sqlHm.get("news_all"));
         fileQuery = TextEdit.trim(sqlHm.get("fileQuery"));
      } else if (type.equals("day")) {
         sql = TextEdit.trim(sqlHm.get("news_day"));
      }

      try {

         // DB연결 설정
         conn = ConnectionPool.getConnection("default");
         stmt = conn.createStatement();

         logger.info("sql : " + sql);

         rs = stmt.executeQuery(sql);
         logger.info("rs 접속");
         // 색인 WRITE 설정
         String indexFolder = Config.getTrinityInxDB() + File.separator + "news";
         String taxoFolder = Config.getTrinityTaxoDB() + File.separator + "news";
         File IndexChkDir = new File(indexFolder);
         File TaxoChkDir = new File(taxoFolder);
        
         System.out.println( IndexChkDir.getPath() );
         System.out.println( TaxoChkDir.getPath() );

         Directory indexDir = FSDirectory.open(IndexChkDir);
         Directory taxoDir = FSDirectory.open(TaxoChkDir);

         Analyzer analyzer = new StandardAnalyzer(Version.ITRINITY_40);
         IndexWriterConfig iwc = new IndexWriterConfig(Version.ITRINITY_40, analyzer);

         if (type.equals("all")) {
            iwc.setOpenMode(OpenMode.CREATE);
            taxo = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
         } else {
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            taxo = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE_OR_APPEND);
         }

         iw = new IndexWriter(indexDir, iwc);
         // 메모리 단위로 설정해서 색인
         iwc.setRAMBufferSizeMB(1024);

         // 검색 옵션 초기화
         String search_field[] = { "keyword_field_regdate", "keyword_author", "keyword_newscompany", "keyword_j_seq",
               "unstored_field_subject", "unstored_field_desc", "unstored_field_attatch" };
         String sort_field[] = { "keyword_field_subject", "keyword_field_regdate" };
         String facet_Field[] = { "" };
         // 기존의 옵션 삭제
         Term term = new Term("search_key", "index_info");
         iw.deleteDocuments(term);
         // 옵션 재 등록
         Document info_doc = setIndexInfo(search_field, sort_field, facet_Field);
         iw.addDocument(info_doc);

         int i = 1;
         while (rs.next()) {

            String j_seq = TextEdit.trim(rs.getString("J_SEQ"));
            String field_subject = TextEdit.trim(rs.getString("FIELD_SUBJECT"));
            String field_desc = TextEdit.trim(rs.getString("FIELD_DESC"));
            String author = TextEdit.trim(rs.getString("AUTHOR"));
            String newscompany = TextEdit.trim(rs.getString("NEWSCOMPANY"));
            String category = TextEdit.trim(rs.getString("TAB_CATEGORY"));
            String url = TextEdit.trim(rs.getString("URL"));
            String field_regdate = TextEdit.trim(rs.getString("FIELD_REGDATE"));
            // 첨부파일 처리용
            //String atch_file_id = TextEdit.trim(rs.getString("ATCH_FILE_ID"));

            // 첨부파일 처리
            //String attach_list = "";
            //String field_attatch = "";

            //HashMap tempHm = getAttachData(atch_file_id, fileQuery, conn);
            //attach_list = TextEdit.trim(tempHm.get("attach_list"));
            //field_attatch = TextEdit.trim(tempHm.get("field_attach"));

            HashMap hm = new HashMap();

            hm.put("j_seq", j_seq);
            hm.put("field_subject", field_subject);
            hm.put("field_desc", field_desc);
            hm.put("author", author);
            hm.put("newscompany", newscompany);
            hm.put("category", category);
            hm.put("url", url);
            hm.put("field_regdate", field_regdate);

//            System.out.println(hm.toString());

            // facet 필드 설정.
            CategoryPath field_category = new CategoryPath("facet_keyword", "a");
            List<CategoryPath> facetList = categoryPathArrayToList(field_category);
            CategoryDocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo)
                  .setCategoryPaths(facetList);

            Document doc = makeIndexing(hm);
            categoryDocBuilder.build(doc);

            iw.addDocument(doc);

            if (i % 10000 == 0)
               logger.info(i + "건 색인중...");
            i++;

         }
         logger.info("전체 " + i + "건 색인 완료.");
         taxo.commit();
         iw.commit();
      } catch (Exception e) {
         logger.error("callbackIndexing error! : " + e.toString());
         logger.error("query : " + sql);
      } finally {
         if (rs != null)
            try {
               rs.close();
            } catch (Exception e) {
            }
         if (stmt != null)
            try {
               stmt.close();
            } catch (Exception e) {
            }
         if (iw != null)
            try {
               iw.close();
            } catch (IOException ex) {
            }
         if (taxo != null)
            try {
               taxo.close();
            } catch (IOException ex) {
            }
      }

      logger.info("NewsIndex(새소식) End!!!");
   }

   @Override
   public Document makeIndexing(HashMap hm) throws Exception {
      // TODO Auto-generated method stub
      Document document = new Document();
      try {

         // 키값
         String j_seq = TextEdit.trim(hm.get("j_seq"));

         // 제목
         String field_subject = TextEdit.trim(hm.get("field_subject"));

         // 내용
         String field_desc = TextEdit.trim(hm.get("field_desc"));

         // 작성자
         String author = TextEdit.trim(hm.get("author"));
         // 소속
         String newscompany = TextEdit.trim(hm.get("newscompany"));
         // 카테고리
         String category = TextEdit.trim(hm.get("category"));
         // url
         String url = TextEdit.trim(hm.get("url"));
         // 날짜
         String field_regdate = TextEdit.trim(hm.get("field_regdate"));

         // 첨부파일
         String field_attatch = TextEdit.trim(hm.get("field_attatch"));
         String attach_list = TextEdit.trim(hm.get("attach_list"));

/////////////////////////////////////////////////////////////////////////////////////////////////////////////

         // 키값
         document.add(IndexUtil.Keyword("keyword_j_seq", j_seq));
         document.add(IndexUtil.Stroed("stored_j_seq", j_seq));

         // 제목색인
         document.add(IndexUtil.Keyword("keyword_field_subject", field_subject));
         document.add(IndexUtil.UnStored_nGramALL("unstored_field_subject", field_subject));
         document.add(IndexUtil.Stroed("stored_field_subject", field_subject));

         // 내용색인
         document.add(IndexUtil.UnStored_nGram("unstored_field_desc", field_desc));
         document.add(IndexUtil.Stroed("stored_field_desc", field_desc));

         // 작성자
         document.add(IndexUtil.Keyword("keyword_author", author));
         document.add(IndexUtil.Stroed("stored_author", author));

         // 소속
         document.add(IndexUtil.Keyword("keyword_newscompany", newscompany));
         document.add(IndexUtil.Stroed("stored_newscompany", newscompany));

         // 카테고리
         document.add(IndexUtil.Keyword("keyword_category", category));
         document.add(IndexUtil.Stroed("stored_category", category));

         // url
         document.add(IndexUtil.Keyword("keyword_url", url));
         document.add(IndexUtil.Stroed("stored_url", url));

         // 날짜
         document.add(IndexUtil.Keyword("keyword_field_regdate", field_regdate));
         document.add(IndexUtil.Stroed("stored_field_regdate", field_regdate));
         document.add(IndexUtil.Stroed("stored_view_regdate", field_regdate.replaceAll("-", ".")));

         // 첨부파일
         // document.add(IndexUtil.UnStored_nGram("unstored_field_attatch",
         // field_attatch));
         // document.add(IndexUtil.Stroed("stored_field_attatch", field_attatch));
         // document.add(IndexUtil.Stroed("stored_attach_list", attach_list));

      } catch (Exception e) {
         throw new Exception("색인 문서 생성 에러 " + InternalException.getExceptionMessage(e));
      }
      return document;
   }

   public static HashMap getAttachData(String atch_file_id, String fileQuery, Connection conn) {
      HashMap hm = new HashMap();
      Statement stmt = null;
      ResultSet rs = null;

      String sql = fileQuery.replace("@@ATCH_FILE_ID", atch_file_id);

      try {
         stmt = conn.createStatement();
         rs = stmt.executeQuery(sql);

         String attach_list = "";
         String field_attach = "";

         while (rs.next()) {
            String file_stre_cours = TextEdit.trim(rs.getString("FILE_STRE_COURS"));
            String stre_file_nm = TextEdit.trim(rs.getString("STRE_FILE_NM"));
            String orignl_file_nm = TextEdit.trim(rs.getString("ORIGNL_FILE_NM"));
            String file_extsn = TextEdit.trim(rs.getString("FILE_EXTSN"));
            String creat_dt = TextEdit.trim(rs.getString("CREAT_DT"));

            String orgPath = file_stre_cours + File.separator + stre_file_nm;
            String txtPath = Config.getTrinityTxt() + creat_dt + File.separator + atch_file_id + File.separator
                  + stre_file_nm + ".txt";

            File orgFileChk = new File(orgPath);
            if (orgFileChk.exists()) {

               String txtDir = Config.getTrinityTxt() + creat_dt + File.separator + atch_file_id;

               File TxtPathChkDir = new File(txtDir);
               if (!TxtPathChkDir.exists()) {
                  TxtPathChkDir.mkdirs();
               }

               try {
                  TextFilter.filter(orgPath, txtPath);
               } catch (Exception e) {
                  System.out.println(e.toString());
               }

            }

            attach_list = attach_list + file_extsn + "@@@" + orignl_file_nm + "###";

            try {
               field_attach = field_attach + TextEdit.getTextData(txtPath) + " " + orignl_file_nm;
            } catch (Exception e) {
            }
         }

         hm.put("attach_list", attach_list);
         hm.put("field_attach", field_attach);
      } catch (Exception e) {
         logger.error(e.toString());
      } finally {
         if (rs != null)
            try {
               rs.close();
            } catch (Exception e) {
            }
         if (stmt != null)
            try {
               stmt.close();
            } catch (Exception e) {
            }
      }
      return hm;
   }

   /*
    * 색인삭제 : 생성된 IndexWriter로 삭제처리함, 색인 중에 삭제처리가 필요할때 사용 IndexWriter.close는 없음 -
    * 색인단에서 처리
    */
   public static boolean deleteDocumentOne(kr.co.iTrinity4.index.IndexWriter writer, String field_name,
         String field_value) {

      boolean result = false;

      try {
         writer.deleteDocuments(new Term(field_name, field_value));
         result = true;
      } catch (Exception e) {
         logger.info("색인 삭제 에러");
         logger.info(e.toString());
      }
      return result;
   }

}